From 69f3721a36d20e83f9282cc7ff8f9d8154a3a59c Mon Sep 17 00:00:00 2001 From: chezhang <1376507468@qq.com> Date: Thu, 4 Sep 2025 14:55:53 +0800 Subject: [PATCH 001/121] [fix] fix fail test when backend is mack --- .../batch_norm_kernel_register.cc | 10 +- .../conv_transpose_grad_kernel_register.cu | 40 - .../conv_transpose_grad_kernel_register.cu | 1114 +++++++++++++++++ .../impl/spectral_norm_grad_kernel_impl.h | 130 -- .../kernels/impl/spectral_norm_kernel_impl.h | 182 --- backends/metax_gpu/kernels/metax_context.cc | 1 + backends/metax_gpu/kernels/metax_context.h | 1 + .../instance_norm_grad_kerne_registerl.cu | 650 ++++++++++ .../instance_norm_kernel_register.cu | 253 ++++ .../spectral_norm_grad_kernel_register.cu | 22 + .../spectral_norm_kernel_register.cu | 22 + backends/metax_gpu/patch/paddle.patch | 462 +++++++ 12 files changed, 2534 insertions(+), 353 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu delete mode 100644 backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h delete mode 100644 backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h create mode 100644 backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc b/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc index b12f208bec0..ac3d8b95062 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc +++ b/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc @@ -20,4 +20,12 @@ PD_CUSTOM_KERNEL_REGISTER(batch_norm_infer, ALL_LAYOUT, phi::BatchNormInferKernel, float, - phi::dtype::float16) {} + double, + phi::dtype::bfloat16, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu deleted file mode 100644 index dacced51df4..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu" // NOLINT -PD_CUSTOM_KERNEL_REGISTER(conv2d_transpose_grad, - metax_gpu, - ALL_LAYOUT, - phi::Conv2dTransposeGradKernel, - float, - double) {} -PD_CUSTOM_KERNEL_REGISTER(conv2d_transpose_double_grad, - metax_gpu, - ALL_LAYOUT, - phi::Conv2dTransposeDoubleGradKernel, - float, - double) {} -PD_CUSTOM_KERNEL_REGISTER(conv3d_transpose_grad, - metax_gpu, - ALL_LAYOUT, - phi::Conv3dTransposeGradKernel, - float, - double) {} -PD_CUSTOM_KERNEL_REGISTER(depthwise_conv2d_transpose_grad, - metax_gpu, - ALL_LAYOUT, - phi::DepthwiseConv2dTransposeGradKernel, - float, - double) {} diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu new file mode 100644 index 00000000000..0067818d165 --- /dev/null +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu @@ -0,0 +1,1114 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "kernels/gpudnn/conv_cudnn_v7.h" +#include "kernels/metax_context.h" +#include "paddle/common/ddim.h" +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" +#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/conv_transpose_grad_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/padding.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { + +using GPUDNNDataLayout = phi::backends::gpu::DataLayout; + +template +void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + // 0-size + if (x.numel() == 0) { + if (dx) dev_ctx.template Alloc(dx); + if (dfilter) { + phi::Full(dev_ctx, + phi::IntArray(common::vectorize(dfilter->dims())), + 0, + dfilter); + } + return; + } + if (filter.numel() == 0) { + if (dfilter) dev_ctx.template Alloc(dfilter); + if (dx) { + phi::Full( + dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx); + } + return; + } + + const T* filter_data = filter.data(); + std::vector paddings_ = paddings; + std::vector dilations_ = + dilations; // cudnn v5 does not support dilations + const GPUDNNDataLayout data_layout = + (data_format != "NHWC" 
? GPUDNNDataLayout::kNCHW + : GPUDNNDataLayout::kNHWC); + + // if channel_last, transpose to channel_first + DenseTensor x_transpose; + DenseTensor dout_transpose; + std::vector x_vec = common::vectorize(x.dims()); + std::vector out_vec = common::vectorize(dout.dims()); + if (data_layout == GPUDNNDataLayout::kNHWC) { + if (strides.size() == 2U) { + std::vector axis = {0, 3, 1, 2}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(dev_ctx, x, axis); + dout_transpose = Transpose(dev_ctx, dout, axis); + } else if (strides.size() == 3U) { + std::vector axis = {0, 4, 1, 2, 3}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(dev_ctx, x, axis); + dout_transpose = Transpose(dev_ctx, dout, axis); + } + } else { + x_transpose = x; + dout_transpose = dout; + } + + // update padding and dilation + auto x_dims = x_transpose.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims; + x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + + std::vector x_pad(x_dims.size() * 2, 0); + DenseTensor transformed_dout; + std::vector padding_common(data_dim, 0); + if (!is_sys_pad) { + std::vector padding_diff(data_dim); + std::vector new_dout_shape_vec(data_dim + 2); + new_dout_shape_vec[0] = dout_transpose.dims()[0]; + new_dout_shape_vec[1] = dout_transpose.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_dout_shape_vec[i + 2] = + dout_transpose.dims()[i + 2] + padding_diff[i]; + x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + + transformed_dout.Resize(common::make_ddim(new_dout_shape_vec)); + dev_ctx.template Alloc(&transformed_dout); + + const int rank = x_transpose.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + dev_ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + case 5: { + funcs::PadFunction( + dev_ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor.")); + } + } else { + transformed_dout = dout_transpose; + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + const T* x_data = x_transpose.data(); + const T* dout_data = transformed_dout.data(); + out_vec = common::vectorize(transformed_dout.dims()); + + // ------------------- cudnn descriptors --------------------- +#ifndef PADDLE_WITH_HIP + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_dout); + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(filter); + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(x_transpose); +#endif + + GPUDNNDataLayout layout; + + if (strides.size() == 2U) { + layout = GPUDNNDataLayout::kNCHW; + } else { + layout = 
GPUDNNDataLayout::kNCDHW; + } + + int iwo_groups = groups; + int c_groups = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + auto dtype = phi::backends::gpu::CudnnDataType::type; + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + ConvArgs args1{handle, + &transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; + ConvArgs args2{handle, + &transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; + +#ifdef PADDLE_WITH_HIP + SearchResult fwd_result; + SearchResult filter_result; +#else + SearchResult fwd_result; + SearchResult filter_result; +#endif + + auto layout_tensor = phi::backends::gpu::GetCudnnTensorFormat(layout); + size_t workspace_size = 0; + bool deterministic = FLAGS_cudnn_deterministic; + T* dx_data = nullptr; + T* dfilter_data = nullptr; + + if (dx) { + dx_data = dev_ctx.template Alloc(dx); + + args1.idesc.set(transformed_dout, iwo_groups); + args1.wdesc.set(filter, layout_tensor, iwo_groups); + args1.odesc.set(x_transpose, iwo_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); + fwd_result.algo = + search1::Find(args1, false, deterministic, workspace_size, dev_ctx); +#else + using search1 = SearchAlgorithm; + fwd_result = search1::Find(dev_ctx, args1, false, deterministic, false); + workspace_size = std::max( + workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); +#endif + } + + if (dfilter) { + dfilter_data = dev_ctx.template Alloc(dfilter); + + args2.idesc.set(transformed_dout, iwo_groups); + args2.wdesc.set(*dfilter, layout_tensor, iwo_groups); + args2.odesc.set(x_transpose, iwo_groups); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_result.algo = + search2::Find(args2, false, deterministic, workspace_size, dev_ctx); +#else + using search2 = SearchAlgorithm; + filter_result = + search2::Find(dev_ctx, args2, false, deterministic, false); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); +#endif + } + + // ------------------- cudnn conv backward data --------------------- + // FIxME(typhoonzero): template type T may not be the same as cudnn call. + int x_offset = x.numel() / x.dims()[0] / groups; + int dout_offset = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int filter_offset = filter.numel() / groups; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + // auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = GetDnnWorkspace( + const_cast(&(dev_ctx.GetAllocator())), dev_ctx.stream()); + if (dx) { +#ifdef PADDLE_WITH_HIP + // Because beta is zero, it is unnecessary to reset dx. 
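+    // The data gradient of a transposed convolution is an ordinary forward
+    // convolution of dOut with the filter, so miopenConvolutionForward is
+    // issued once per group below, sharing the workspace across groups.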
+ for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionForward(handle, + &alpha, + args1.idesc.desc(), + dout_data + dout_offset * g, + args1.wdesc.desc(), + filter_data + filter_offset * g, + args1.cdesc.desc(), + fwd_result.algo, + &beta, + args1.odesc.desc(), + dx_data + x_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args1, + fwd_result, + dout_data, + filter_data, + dx_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + + if (data_layout == GPUDNNDataLayout::kNHWC) { + DenseTensor dx_transpose; + DenseTensor dx_nchw; + dx_nchw.ShareDataWith(*dx); + dx_nchw.Resize(common::make_ddim(x_vec)); + if (strides.size() == 2U) { + std::vector axis = {0, 2, 3, 1}; + dx_transpose = Transpose(dev_ctx, dx_nchw, axis); + *dx = dx_transpose; + } else if (strides.size() == 3U) { + std::vector axis = {0, 2, 3, 4, 1}; + dx_transpose = Transpose(dev_ctx, dx_nchw, axis); + *dx = dx_transpose; + } + } + } + + // ------------------- cudnn conv backward filter --------------------- + if (dfilter) { + // Because beta is zero, it is unnecessary to reset dfilter. + // Gradient with respect to the filter +#ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args2.odesc.desc(), + x_data + x_offset * g, + args2.idesc.desc(), + dout_data + dout_offset * g, + args2.cdesc.desc(), + filter_result.algo, + &beta, + args2.wdesc.desc(), + dfilter_data + filter_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args2, + filter_result, + x_data, + dout_data, + dfilter_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + } +} + +template +void Conv2dTransposeGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(dev_ctx, + x, + filter, + dout, + strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +/* + * Inputs: I, filter, dout, ddI, ddfilter + * Outputs: ddout, dfilter, dI + * ddo = conv_bp_data(filter, ddI) + conv_bp_data(ddfilter, I) + * dfilter = conv_bp_filter(dout, ddI) + * dI = conv(dout, ddfilter) + */ +template +void Conv2dTransposeDoubleGradGPUDNNKernel( + const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& ddfilter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter, + DenseTensor* ddout) { + 
if (dx) { + dev_ctx.template Alloc(dx); + } + if (dfilter) { + dev_ctx.template Alloc(dfilter); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + funcs::SetConstant set_zero; + set_zero(dev_ctx, ddout, static_cast(0)); + } + + const T* filter_ = filter.data(); + const T* dout_ = dout.data(); + const T* ddx_ = nullptr; + const T* ddfilter_ = nullptr; + T* dx_ = nullptr; + T* dfilter_ = nullptr; + T* ddout_ = nullptr; + T* transformed_dx_ = nullptr; + + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + bool deterministic = FLAGS_cudnn_deterministic; + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform DenseTensors to channel first----------- + DenseTensor transformed_x_channel(x.type()); + DenseTensor transformed_dout_channel(dout.type()); + DenseTensor transformed_ddx_channel(x.type()); + + DenseTensor transformed_dx_channel(x.type()); + DenseTensor transformed_ddout_channel(dout.type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &x, &transformed_x_channel); + TransToChannelFirst(dev_ctx, &x, &transformed_x_channel); + + ResizeToChannelFirst(dev_ctx, &dout, &transformed_dout_channel); + TransToChannelFirst(dev_ctx, &dout, &transformed_dout_channel); + + ResizeToChannelFirst(dev_ctx, &ddx, &transformed_ddx_channel); + TransToChannelFirst(dev_ctx, &ddx, &transformed_ddx_channel); + + if (dx) { + ResizeToChannelFirst(dev_ctx, dx, &transformed_dx_channel); + dev_ctx.template Alloc(&transformed_dx_channel); + } + if (ddout) { + ResizeToChannelFirst( + dev_ctx, ddout, &transformed_ddout_channel); + } + } else { + transformed_x_channel = x; + transformed_dout_channel = dout; + transformed_ddx_channel = ddx; + + if (dx) { + transformed_dx_channel = *dx; + } + } + std::vector out_vec = + common::vectorize(transformed_dout_channel.dims()); + + auto x_dims = transformed_x_channel.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + DenseTensor transformed_x(x.type()); + DenseTensor transformed_ddx(x.type()); + + DenseTensor transformed_dout(dout.type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(x.dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + std::vector new_output_grad_shape_vec(data_dim + 2); + + new_input_shape_vec[0] = transformed_x_channel.dims()[0]; + new_input_shape_vec[1] = transformed_x_channel.dims()[1]; + + new_output_grad_shape_vec[0] = transformed_dout_channel.dims()[0]; + new_output_grad_shape_vec[1] = transformed_dout_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_x_channel.dims()[i + 2] + padding_diff[i]; + + new_output_grad_shape_vec[i + 2] = + transformed_dout_channel.dims()[i + 2] + padding_diff[i]; + + input_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + DDim 
new_input_shape(common::make_ddim(new_input_shape_vec)); + transformed_x.Resize(new_input_shape); + transformed_ddx.Resize(new_input_shape); + transformed_dout.Resize(common::make_ddim(new_output_grad_shape_vec)); + + dev_ctx.template Alloc(&transformed_x); + dev_ctx.template Alloc(&transformed_ddx); + dev_ctx.template Alloc(&transformed_dout); + + // pad for input + const int rank = x.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction(dev_ctx, + input_pad, + transformed_x_channel, + pad_value, + &transformed_x); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_dout_channel, + pad_value, + &transformed_dout); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + case 5: { + funcs::PadFunction(dev_ctx, + input_pad, + transformed_x_channel, + pad_value, + &transformed_x); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + } else { + transformed_x = transformed_x_channel; + transformed_dout = transformed_dout_channel; + transformed_ddx = transformed_ddx_channel; + + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + std::vector starts(data_dim, 0); + std::vector ends(data_dim, 0); + std::vector axes(data_dim, 0); + for (size_t i = 0; i < data_dim; ++i) { + starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); + ends[i] = starts[i] + out_vec[i + 2]; + axes[i] = i + 2; + } + + std::vector transformed_out_vec = out_vec; + for (size_t i = 0; i < data_dim; ++i) { + transformed_out_vec[i + 2] = + out_vec[i + 2] + + (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - + 2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1]; + } + + if (!is_sys_pad) { + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); + dev_ctx.template Alloc(&transformed_ddout_channel); + } else { + dev_ctx.template Alloc(ddout); + transformed_ddout_channel = *ddout; + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); + } + + const T* x_ = transformed_x.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = phi::backends::gpu::CudnnDataType::type; + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + auto layout = + phi::backends::gpu::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); + + ConvArgs args1{handle, + &transformed_ddout_channel, + &filter, + &transformed_ddx, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args2{handle, + &transformed_ddout_channel, + &ddfilter, + &transformed_x, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + + ConvArgs args3{handle, + &transformed_dout, + dfilter, + &transformed_ddx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args4{handle, + &transformed_dout, + &ddfilter, + &transformed_dx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; +#ifdef PADDLE_WITH_HIP + SearchResult 
bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; +#else + SearchResult bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; +#endif + + // ddo = conv(ddI, filter) + conv(I, ddfilter) + size_t workspace_size = 0; + + T* transformed_ddout_channel_ = nullptr; + + if (ddout) { + ddout_ = ddout->data(); + transformed_ddout_channel_ = transformed_ddout_channel.data(); + + args1.idesc.set(transformed_ddout_channel, iwo_group); + args1.wdesc.set(filter, layout, iwo_group); + args1.odesc.set(transformed_ddx, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + bwd_result1.algo = + search1::Find(args1, false, deterministic, workspace_size, dev_ctx); +#else + using search1 = SearchAlgorithm; + bwd_result1 = search1::Find(dev_ctx, args1, false, deterministic, false); + workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); +#endif + + ddfilter_ = ddfilter.data(); + args2.handle = handle; + args2.idesc.set(transformed_ddout_channel, iwo_group); + args2.wdesc.set(ddfilter, layout, iwo_group); + args2.odesc.set(transformed_x, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + bwd_result2.algo = + search2::Find(args2, false, deterministic, workspace_size, dev_ctx); +#else + using search2 = SearchAlgorithm; + bwd_result2 = search2::Find(dev_ctx, args2, false, deterministic, false); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); +#endif + } + + if (dfilter) { + dfilter_ = dfilter->data(); + + args3.idesc.set(transformed_dout, iwo_group); + args3.wdesc.set(*dfilter, layout, iwo_group); + args3.odesc.set(transformed_ddx_channel, iwo_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search3 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_result.algo = + search3::Find(args3, false, deterministic, workspace_size, dev_ctx); +#else + using search3 = SearchAlgorithm; + filter_result = + search3::Find(dev_ctx, args3, false, deterministic, false); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); +#endif + } + + if (dx) { + transformed_dx_ = transformed_dx_channel.data(); + + args4.handle = handle; + args4.idesc.set(transformed_dout, iwo_group); + args4.wdesc.set(ddfilter, layout, iwo_group); + args4.odesc.set(transformed_dx_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search4 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); + fwd_result.algo = + search4::Find(args4, false, deterministic, workspace_size, dev_ctx); +#else + using search4 = SearchAlgorithm; + fwd_result = search4::Find(dev_ctx, args4, false, deterministic, false); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW(transformed_x.dims(), + 
GPUDNNDataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(transformed_dout.dims(), + GPUDNNDataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = + transformed_x.numel() / transformed_x.dims()[0] / groups; + int group_offset_out = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int group_offset_filter = filter.numel() / groups; + + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + + // auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = GetDnnWorkspace( + const_cast(&(dev_ctx.GetAllocator())), dev_ctx.stream()); + if (ddout) { + ddx_ = transformed_ddx.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + ddx_ + i * group_offset_in, + args1.wdesc.desc(), + filter_ + i * group_offset_filter, + args1.cdesc.desc(), + bwd_result1.algo, + &beta, + args1.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args1, + bwd_result1, + ddx_, + filter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + // MIOPEN ONLY support beta to be 0.0f + DenseTensor conv_x_ddfilter(dout.type()); + conv_x_ddfilter.Resize(transformed_ddout_channel.dims()); + T* conv_x_ddfilter_data = dev_ctx.template Alloc(&conv_x_ddfilter); + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args2.odesc.desc(), + x_ + i * group_offset_in, + args2.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args2.cdesc.desc(), + bwd_result2.algo, + &beta, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenOpTensor( + handle, + miopenTensorOpAdd, + &alpha, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + &alpha, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + &beta, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out)); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args2, + bwd_result2, + x_, + ddfilter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + true); +#endif // PADDLE_WITH_HIP + + if ((!is_sys_pad) && (!channel_last)) { + if (strides.size() == 2U) { + funcs::Slice( + dev_ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice( + dev_ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } + } else if ((!is_sys_pad) && (channel_last)) { + if (strides.size() == 2U) { + funcs::Slice(dev_ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice(dev_ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } + + TransToChannelLast( + dev_ctx, &transformed_ddout_channel, ddout); 
+ } + } + + T* transformed_dout_channel_ = transformed_dout.data(); + if (dfilter) { + ddx_ = transformed_ddx_channel.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + ddx_ + i * group_offset_in, + args3.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args3.cdesc.desc(), + filter_result.algo, + &beta, + args3.wdesc.desc(), + dfilter_ + i * group_offset_filter, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args3, + filter_result, + ddx_, + transformed_dout_channel_, + dfilter_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + } + + if (dx) { + ddfilter_ = ddfilter.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward( + handle, + &alpha, + args4.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args4.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args4.cdesc.desc(), + fwd_result.algo, + &beta, + args4.odesc.desc(), + transformed_dx_ + i * group_offset_in, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args4, + fwd_result, + transformed_dout_channel_, + ddfilter_, + transformed_dx_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_dx_channel, dx); + } + } +} + +template +void Conv3dTransposeGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(dev_ctx, + x, + filter, + dout, + strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +} // namespace phi + +using float16 = phi::dtype::float16; + +PD_REGISTER_PLUGIN_KERNEL(conv2d_transpose_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv2dTransposeGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(conv2d_transpose_double_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv2dTransposeDoubleGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(conv3d_transpose_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv3dTransposeGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} diff --git a/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h deleted file mode 100644 index 03651be95c3..00000000000 --- a/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/impl/spectral_norm_kernel_impl.h" - -namespace phi { - -template -void SpectralNormGradKernel(const Context& dev_ctx, - const DenseTensor& weight, - const DenseTensor& u, - const DenseTensor& v, - const DenseTensor& out_grad, - int dim, - int power_iters, - float eps, - DenseTensor* weight_grad) { - auto& place = *dev_ctx.eigen_device(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - const int h = u.dims()[0]; - const int w = v.dims()[0]; - - DenseTensor weight_mat, out_grad_mat; - auto dims = weight.dims(); - const int rank = dims.size(); - std::vector real_dims; - if (dim != 0) { - std::vector perm; - perm.push_back(dim); - real_dims.push_back(dims[dim]); - for (int i = 0; i < rank; i++) { - if (i != dim) { - perm.push_back(i); - real_dims.push_back(dims[i]); - } - } - weight_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&weight_mat); - out_grad_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&out_grad_mat); - TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat); - TransCompute2DTo5D( - dev_ctx, out_grad, rank, perm, &out_grad_mat); - } else { - for (int i = 0; i < rank; i++) { - real_dims.push_back(i); - } - phi::Copy(dev_ctx, weight, dev_ctx.GetPlace(), true, &weight_mat); - phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), true, &out_grad_mat); - } - weight_mat = weight_mat.Resize({h, w}); - out_grad_mat = out_grad_mat.Resize({h, w}); - - DenseTensor sigma; - sigma.Resize(weight_mat.dims()); - dev_ctx.template Alloc(&sigma); - DenseTensor uu, vv; - phi::Copy(dev_ctx, u, dev_ctx.GetPlace(), true, &uu); - phi::Copy(dev_ctx, v, dev_ctx.GetPlace(), true, &vv); - CalcMatrixSigmaAndNormWeight(dev_ctx, - &weight_mat, - &(uu.Resize({h, 1})), - &(vv.Resize({w, 1})), - &sigma, - power_iters, - eps); - - DenseTensor uv; - uv.Resize({h, w}); - dev_ctx.template Alloc(&uv); - blas.MatMul( - uu.Resize({h, 1}), false, vv.Resize({w, 1}), false, T(1), &uv, T(0)); - - DenseTensor weight_grad_mat; - weight_grad_mat.Resize({h, w}); - dev_ctx.template Alloc(&weight_grad_mat); - auto weight_grad_mat_t = EigenTensor::From(weight_grad_mat); - auto weight_mat_t = EigenTensor::From(weight_mat); - auto out_grad_mat_t = EigenTensor::From(out_grad_mat); - auto sigma_t = EigenTensor::From(sigma); - auto uv_t = EigenTensor::From(uv); - weight_mat_t.device(place) = - weight_mat_t.sum().eval().reshape(Array2(1, 1)).broadcast(Array2(h, w)); - weight_grad_mat_t.device(place) = - out_grad_mat_t * (out_grad_mat_t.constant(1.0) - uv_t * weight_mat_t) / - sigma_t; - - if (dim != 0) { - std::vector perm; - for (int i = 0; i < rank; i++) { - if (i < dim) { - perm.push_back(i + 1); - } else if (i == dim) { - perm.push_back(0); - } else { - perm.push_back(i); - } - } - weight_grad->Resize(dims); - dev_ctx.template Alloc(weight_grad); - TransCompute2DTo5D( - dev_ctx, - weight_grad_mat.Resize(common::make_ddim(real_dims)), - rank, - perm, - weight_grad); - } else { - phi::Copy(dev_ctx, - 
weight_grad_mat.Resize(dims), - dev_ctx.GetPlace(), - true, - weight_grad); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h b/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h deleted file mode 100644 index 8c9fc548259..00000000000 --- a/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/funcs/blas/blas.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; -using IndexPair = Eigen::IndexPair; - -template -static inline void TransCompute2DTo5D(const Context& dev_ctx, - const DenseTensor& in, - const int rank, - const std::vector& perm, - DenseTensor* out) { - if (rank <= 1 || rank > 5) { - PADDLE_THROW(common::errors::Fatal( - "Weight rank of SpectralNorm should be in range [2, 5], but got %d.", - rank)); - } - - switch (rank) { - case 2: - phi::funcs::Transpose trans2; - trans2(dev_ctx, in, out, perm); - break; - case 3: - phi::funcs::Transpose trans3; - trans3(dev_ctx, in, out, perm); - break; - case 4: - phi::funcs::Transpose trans4; - trans4(dev_ctx, in, out, perm); - break; - case 5: - phi::funcs::Transpose trans5; - trans5(dev_ctx, in, out, perm); - break; - default: - break; - } -} - -template -static inline void CalcMatrixSigmaAndNormWeight(const Context& dev_ctx, - DenseTensor* weight, - DenseTensor* u, - DenseTensor* v, - DenseTensor* sigma, - const int power_iters, - const float eps) { - auto& place = *dev_ctx.eigen_device(); - auto blas = funcs::GetBlas(dev_ctx); - auto sigma_t = EigenTensor::From(*sigma); - auto weight_t = EigenTensor::From(*weight); - auto u_t = EigenTensor::From(*u); - auto v_t = EigenTensor::From(*v); - - const int h = weight->dims()[0]; - const int w = weight->dims()[1]; - - for (int i = 0; i < power_iters; i++) { - // V = W^T * U / ||W^T * U||_2 - blas.MatMul(*weight, true, *u, false, T(1), v, T(0)); - auto v_t_norm = - v_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( - Array1(w)); - v_t.device(place) = v_t / (v_t_norm + v_t_norm.constant(eps)); - // U = W^T * V / ||W^T * V||_2 - blas.MatMul(*weight, false, *v, false, T(1), u, T(0)); - auto u_t_norm = - u_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( - Array1(h)); - u_t.device(place) = u_t / (u_t_norm + u_t_norm.constant(eps)); - } - DenseTensor weight_v; - weight_v.Resize({h, 1}); - dev_ctx.template Alloc(&weight_v); - blas.MatMul(*weight, false, *v, false, T(1), &weight_v, T(0)); - auto weight_v_t = EigenTensor::From(weight_v); - sigma_t.device(place) = (u_t * weight_v_t) - .sum() - .eval() - .reshape(Array2(1, 1)) - .broadcast(Array2(h, w)); - weight_t.device(place) = weight_t / sigma_t; -} - -template -void SpectralNormKernel(const 
Context& dev_ctx, - const DenseTensor& weight, - const DenseTensor& u, - const DenseTensor& v, - int dim, - int power_iters, - float eps, - DenseTensor* out) { - const int h = u.dims()[0]; - const int w = v.dims()[0]; - - DenseTensor weight_mat; - auto dims = weight.dims(); - const int rank = dims.size(); - std::vector real_dims; - if (dim != 0) { - std::vector perm; - perm.push_back(dim); - real_dims.push_back(dims[dim]); - for (int i = 0; i < rank; i++) { - if (i != dim) { - perm.push_back(i); - real_dims.push_back(dims[i]); - } - } - weight_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&weight_mat); - TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat); - } else { - for (int i = 0; i < rank; i++) { - real_dims.push_back(i); - } - phi::Copy(dev_ctx, weight, dev_ctx.GetPlace(), true, &weight_mat); - } - weight_mat = weight_mat.Resize({h, w}); - - DenseTensor sigma; - sigma.Resize(weight_mat.dims()); - dev_ctx.template Alloc(&sigma); - DenseTensor uu, vv; - phi::Copy(dev_ctx, u, dev_ctx.GetPlace(), true, &uu); - phi::Copy(dev_ctx, v, dev_ctx.GetPlace(), true, &vv); - CalcMatrixSigmaAndNormWeight(dev_ctx, - &weight_mat, - &(uu.Resize({h, 1})), - &(vv.Resize({w, 1})), - &sigma, - power_iters, - eps); - - if (dim != 0) { - std::vector perm; - for (int i = 0; i < rank; i++) { - if (i < dim) { - perm.push_back(i + 1); - } else if (i == dim) { - perm.push_back(0); - } else { - perm.push_back(i); - } - } - out->Resize(dims); - dev_ctx.template Alloc(out); - TransCompute2DTo5D( - dev_ctx, - weight_mat.Resize(common::make_ddim(real_dims)), - rank, - perm, - out); - } else { - phi::Copy(dev_ctx, weight_mat.Resize(dims), dev_ctx.GetPlace(), true, out); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_context.cc index 9bd26a170c5..4df4d88b0b4 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_context.cc @@ -15,6 +15,7 @@ #include "kernels/metax_context.h" namespace phi { +bool AllowTF32Cudnn() { return false; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_context.h index 21e9084a977..5974aadcc41 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_context.h @@ -128,6 +128,7 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } +bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { if (!cusolver_dn_handle_) { diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu new file mode 100644 index 00000000000..d7540d949a9 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -0,0 +1,650 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "kernels/metax_context.h" +#include "paddle/common/layout.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" +#include "paddle/phi/kernels/instance_norm_grad_kernel.h" + +namespace phi { +template +static __global__ void GradComputeDX(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, + const int sample_size, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + BatchNormParamType mean_val = mean[ncid]; + BatchNormParamType inv_var_val = variance[ncid]; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + BatchNormParamType dy_i = static_cast>(dy[i]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[i]) - mean_val); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = + BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] = static_cast( + (static_cast>(dy[i]) - + dy_sum_val / static_cast>(sample_size) - + (static_cast>(x[i]) - mean_val) * + dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * + scale[c] * inv_var_val); + } +} + +static __device__ __forceinline__ float real_sqrt(float x) { + return 1. / sqrtf(x); +} +static __device__ __forceinline__ double real_sqrt(double x) { + return 1. 
/ sqrt(x); +} + +template +__global__ void DoubleGradComputeDX(const T *x, + const AccT *mean, + const AccT *variance, + const T *ddx, + const T *dy, + const AccT *scale, + const AccT *ddscale, + int C, + int sample_size, + const double epsilon, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ AccT dy_sum_val; + __shared__ AccT ddx_sum_val; + __shared__ AccT dy_mul_ddx_sum_val; + __shared__ AccT dy_mul_x_sub_mean_sum_val; + __shared__ AccT ddx_mul_x_sub_mean_sum_val; + + AccT dy_sum = 0; + AccT ddx_sum = 0; + AccT dy_mul_ddx_sum = 0; + AccT dy_mul_x_sub_mean_sum = 0; + AccT ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT ddx_i = static_cast(ddx[i]); + AccT dy_i = static_cast(dy[i]); + AccT tmp = static_cast(x[i]) - mean_val; + + dy_sum += dy_i; + ddx_sum += ddx_i; + dy_mul_ddx_sum += (ddx_i * dy_i); + + dy_mul_x_sub_mean_sum += (dy_i * tmp); + ddx_mul_x_sub_mean_sum += (ddx_i * tmp); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + dy_mul_ddx_sum = + BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + ddx_sum_val = ddx_sum; + dy_mul_ddx_sum_val = dy_mul_ddx_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(dx[i]); + tmp += + ((static_cast(x[i]) - mean_val) * var_val * var_val * var_val / + sample_size * + (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + + 3. 
* dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - static_cast(dy[i])) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * + (ddx_sum_val / sample_size - static_cast(ddx[i]))) * + scale[c]; + dx[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(dx[i]); + tmp += (static_cast(dy[i]) * var_val - + dy_sum_val / sample_size * var_val - + (static_cast(x[i]) - mean_val) * var_val * + dy_mul_x_sub_mean_sum_val * var_val / sample_size) * + ddscale[c]; + dx[i] = static_cast(tmp); + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, + const AccT *mean, + const AccT *variance, + const AccT *ddscale, + const AccT *ddbias, + const T *ddx, + const AccT *scale, + int C, + int sample_size, + const double epsilon, + T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ AccT ddx_sum_val; + __shared__ AccT ddx_mul_x_sub_mean_sum_val; + + AccT ddx_sum = 0; + AccT ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT ddx_i = static_cast(ddx[i]); + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (static_cast(x[i]) - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(ddy[i]); + tmp += scale[c] * var_val * + (static_cast(ddx[i]) - ddx_sum_val / sample_size - + (static_cast(x[i]) - mean_val) * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size); + ddy[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(ddy[i]); + tmp += (static_cast(x[i]) - mean_val) * var_val * ddscale[c]; + ddy[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] = static_cast(static_cast(ddy[i]) + ddbias[c]); + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, + const AccT *mean, + const AccT *variance, + const T *ddx, + const T *dy, + int C, + int sample_size, + const double epsilon, + AccT *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ AccT dy_sum_val; + __shared__ AccT dy_mul_x_sub_mean_sum_val; + + AccT dy_sum = 0; + AccT 
dy_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT dy_i = static_cast(dy[i]); + dy_sum += dy_i; + dy_mul_x_sub_mean_sum += (dy_i * (static_cast(x[i]) - mean_val)); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + AccT dscale_tmp = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dscale_tmp += + static_cast(ddx[i]) * var_val * + (static_cast(dy[i]) - dy_sum_val / sample_size - + dy_mul_x_sub_mean_sum_val * (static_cast(x[i]) - mean_val) * + var_val * var_val / sample_size); + } + dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); + if (threadIdx.x == 0) { + dscale[ncid] += dscale_tmp; + } + __syncthreads(); + } +} + +template +void InstanceNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const paddle::optional &scale, + const paddle::optional &bias UNUSED, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &d_y, + float epsilon_f, + DenseTensor *d_x, + DenseTensor *d_scale, + DenseTensor *d_bias) { + using AccT = typename phi::dtype::MPTypeTrait::Type; + double epsilon = static_cast(epsilon_f); + const auto *scale_ptr = scale.get_ptr(); + + const auto &x_dims = x.dims(); + + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + + DenseTensor x_tmp, d_y_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D}); + + phi::funcs::SetConstant set_constant; + + dev_ctx.template Alloc(d_x); + if (x.numel() == 0) { + if (d_scale) { + dev_ctx.template Alloc(d_scale); + set_constant(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + dev_ctx.template Alloc(d_bias); + set_constant(dev_ctx, d_bias, static_cast(0)); + } + return; + } + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + } + + if (scale_ptr) { + PADDLE_ENFORCE_EQ( + scale_ptr->dims().size(), + 1UL, + common::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of scale's dimensions must be equal to 1. But " + "received: the size of scale's dimensions" + "is [%d]", + scale_ptr->dims().size())); + PADDLE_ENFORCE_EQ(scale_ptr->dims()[0], + C, + common::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the first dimension of scale must be equal to " + "Channels([%d]). 
But received: " + "the first dimension of scale is [%d]," + "the dimensions of scale is [%s], ", + C, + scale_ptr->dims()[0], + scale_ptr->dims())); + } + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(NxC, max_blocks); + const int grid1 = (C + block - 1) / block; + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + + DenseTensor d_scale_tmp; + d_scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_scale_tmp); + + DenseTensor d_bias_tmp; + d_bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_bias_tmp); + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + const auto *saved_mean_data = + saved_mean.template data>(); + const auto *saved_var_data = + saved_variance.template data>(); + + if (d_scale && d_bias) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + miopenBNSpatial, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#endif + } else { + if (d_x) { + GradComputeDX<<>>( + d_y.data(), + scale_tmp.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + H * W * D, + d_x->data()); + } + } + if (d_scale && d_bias) { + add_param<<>>( + d_scale_tmp.data(), d_scale->data(), N, C); + add_param<<>>( + d_bias_tmp.data(), d_bias->data(), N, C); + } + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +template +void InstanceNormDoubleGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const paddle::optional &scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &dy, + const paddle::optional &ddx, + const paddle::optional &ddscale, + const paddle::optional &ddbias, + float epsilon_f, + DenseTensor *dx, + DenseTensor *dscale, + DenseTensor *ddy) { + using AccT = typename phi::dtype::MPTypeTrait::Type; + const auto *Scale = scale.get_ptr(); + const auto *ddX = ddx.get_ptr(); + const auto *ddScale = ddscale.get_ptr(); + const auto *ddBias = ddbias.get_ptr(); + const double epsilon = static_cast(epsilon_f); + const T *x_data = x.data(); + const T *dy_data = dy.data(); + const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); + const AccT *ddscale_data = + (ddScale == nullptr ? nullptr : ddScale->data()); + const AccT *ddbias_data = + (ddScale == nullptr ? 
nullptr : ddBias->data()); + const AccT *mean_data = saved_mean.data(); + const AccT *variance_data = saved_variance.data(); + phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero_AccT; + + auto &x_dims = x.dims(); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + const int n = x.numel(); + int sample_size = n / N / C; + + DenseTensor scale_tmp; + if (!Scale) { + scale_tmp.Resize({C}); + dev_ctx.template Alloc(&scale_tmp); + set_zero_AccT(dev_ctx, &scale_tmp, static_cast(1)); + } + const AccT *scale_data = Scale ? Scale->data() : scale_tmp.data(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = NxC; + const int grid1 = (C + block - 1) / block; + + if (dx) { + T *dx_data = dev_ctx.template Alloc(dx); + set_zero(dev_ctx, dx, static_cast(0)); + DoubleGradComputeDX + <<>>(x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + scale_data, + ddscale_data, + C, + sample_size, + epsilon, + dx_data); + } + if (dscale) { + DenseTensor dscale_tmp; + dscale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&dscale_tmp); + set_zero_AccT(dev_ctx, &dscale_tmp, static_cast(0)); + AccT *dscale_tmp_data = dscale_tmp.data(); + + AccT *dscale_data = dev_ctx.template Alloc(dscale); + set_zero_AccT(dev_ctx, dscale, static_cast(0)); + DoubleGradComputeDScale + <<>>(x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + C, + sample_size, + epsilon, + dscale_tmp_data); + add_param<<>>( + dscale_tmp.data(), dscale->data(), N, C); + } + if (ddy) { + T *ddy_data = dev_ctx.template Alloc(ddy); + set_zero(dev_ctx, ddy, static_cast(0)); + DoubleGradComputeDDY + <<>>(x_data, + mean_data, + variance_data, + ddscale_data, + ddbias_data, + ddx_data, + scale_data, + C, + sample_size, + epsilon, + ddy_data); + } +} +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL(instance_norm_grad, + metax_gpu, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(instance_norm_double_grad, + metax_gpu, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu new file mode 100644 index 00000000000..db975d74665 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu @@ -0,0 +1,253 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "glog/logging.h" +#include "kernels/metax_context.h" +#include "paddle/common/layout.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" +#include "paddle/phi/kernels/instance_norm_kernel.h" + +namespace phi { + +template +void InstanceNormKernel(const Context &dev_ctx, + const DenseTensor &x, + const paddle::optional &scale, + const paddle::optional &bias, + float epsilon_f, + DenseTensor *y, + DenseTensor *saved_mean, + DenseTensor *saved_variance) { + using AccT = typename phi::dtype::MPTypeTrait::Type; + double epsilon = static_cast(epsilon_f); + auto &x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + common::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must greater than " + "or equal to 2. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + common::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must smaller than" + "or equal to 5. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + DenseTensor x_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + dev_ctx.template Alloc(y); + phi::funcs::SetConstant> functor; + phi::funcs::SetConstant functor_y; + if (x.numel() == 0) { + functor_y(dev_ctx, y, static_cast(0)); + if (saved_mean) { + dev_ctx.template Alloc>(saved_mean); + functor(dev_ctx, saved_mean, static_cast>(0)); + } + if (saved_variance) { + dev_ctx.template Alloc>(saved_variance); + functor(dev_ctx, saved_variance, static_cast>(0)); + } + return; + } + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + DenseTensor bias_tmp; + bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&bias_tmp); + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min((NxC + block - 1) / block, max_blocks); + + phi::funcs::SetConstant set_constant; + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + if (bias_ptr) { + repeat_param<<>>( + bias_ptr->data(), bias_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &bias_tmp, static_cast(0)); + } + + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + DenseTensor saved_mean_tmp, saved_variance_tmp; + + if (saved_mean) { + dev_ctx.template Alloc>(saved_mean); + functor(dev_ctx, saved_mean, static_cast>(0)); + } else { + saved_mean_tmp = phi::Full>( + dev_ctx, {NxC}, static_cast>(0)); + } + if (saved_variance) { + dev_ctx.template Alloc>(saved_variance); + functor(dev_ctx, saved_variance, static_cast>(0)); + } else { + saved_variance_tmp = phi::Full>( + dev_ctx, {NxC}, static_cast>(0)); + } + auto *saved_mean_data = saved_mean + ? saved_mean->data>() + : saved_mean_tmp.data>(); + auto *saved_variance_data = + saved_variance ? saved_variance->data>() + : saved_variance_tmp.data>(); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenBatchNormalizationForwardTraining( + handle, + miopenBNSpatial, + const_cast( + static_cast(CudnnDataType::kOne())), + const_cast( + static_cast(CudnnDataType::kZero())), + data_desc_, + static_cast(x_tmp.template data()), + data_desc_, + static_cast(y->template data()), + in_param_desc_, + const_cast(static_cast( + scale_tmp.template data>())), + const_cast(static_cast( + bias_tmp.template data>())), + 0, + nullptr, + nullptr, + epsilon, + static_cast(saved_mean_data), + static_cast(saved_variance_data))); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnBatchNormalizationForwardTraining( + handle, + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + y->template data(), + in_param_desc_, + scale_tmp.template data>(), + bias_tmp.template data>(), + 0, + nullptr, + nullptr, + epsilon, + saved_mean_data, + saved_variance_data)); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL(instance_norm, + metax_gpu, + ALL_LAYOUT, + phi::InstanceNormKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} 
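
Note (editorial, not part of the patch): the registration above keeps the scale/bias inputs, and hence the saved statistics, in FP32 whenever the kernel is instantiated for float16/bfloat16 data. As a reference for what the cuDNN-backed InstanceNormKernel computes per (N, C) instance under the NCHW layout assumed by ExtractNCWHD, here is a minimal NumPy sketch. The helper name instance_norm_ref is hypothetical and does not exist in the repository, and the sketch returns the plain per-instance mean/variance rather than the inverse-variance form that cuDNN stores in saved_variance.

    # Illustrative sketch only; statistics are accumulated in FP32, mirroring
    # the dtype override in the registration above. Not the plugin's code path.
    import numpy as np

    def instance_norm_ref(x, scale=None, bias=None, eps=1e-5):
        # x: array of shape [N, C, *spatial] in NCHW(D) layout
        n, c = x.shape[:2]
        x2 = x.reshape(n, c, -1).astype(np.float32)
        mean = x2.mean(axis=2, keepdims=True)   # per-instance, per-channel mean
        var = x2.var(axis=2, keepdims=True)     # per-instance, per-channel variance
        y = (x2 - mean) / np.sqrt(var + eps)
        if scale is not None:
            y = y * np.asarray(scale, dtype=np.float32).reshape(1, c, 1)
        if bias is not None:
            y = y + np.asarray(bias, dtype=np.float32).reshape(1, c, 1)
        return (y.reshape(x.shape).astype(x.dtype),
                mean.reshape(n, c), var.reshape(n, c))

    # Example: FP16 input, FP32 statistics out
    x = np.random.rand(4, 10, 16, 16).astype(np.float16)
    y, mu, var = instance_norm_ref(x, scale=np.ones(10), bias=np.zeros(10))

This mirrors the unit-test style already used in test/legacy_test (NumPy references checked against the device kernel) and clarifies why saved_mean/saved_variance are forced to FLOAT32 for low-precision inputs.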
diff --git a/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu new file mode 100644 index 00000000000..f99621f8ab9 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(spectral_norm_grad, + metax_gpu, + ALL_LAYOUT, + phi::SpectralNormGradKernel, + float, + double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu new file mode 100644 index 00000000000..466937f993b --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/spectral_norm_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(spectral_norm, + metax_gpu, + ALL_LAYOUT, + phi::SpectralNormKernel, + float, + double) {} diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 184599263fa..682cee35caf 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1028,6 +1028,468 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +index 4099d8b506..baef2cd643 100644 +--- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h ++++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +@@ -14,7 +14,7 @@ + + #pragma once + +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/eigen/common.h" + #include "paddle/phi/kernels/funcs/math_function.h" + +diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py +index 4a5660ea0e..ca4e456e02 100644 +--- a/test/legacy_test/test_batch_norm_op.py ++++ b/test/legacy_test/test_batch_norm_op.py +@@ -22,7 +22,9 @@ from op_test import ( + _set_use_system_allocator, + convert_float_to_uint16, + convert_uint16_to_float, +- get_places, ++ get_devices, ++ is_custom_device, ++ get_device_place, + ) + + import paddle +@@ -189,6 +191,7 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): + + + def create_or_get_tensor(scope, var_name, var, place): ++ + tensor = scope.var(var_name).get_tensor() + if var is not None: + assert isinstance(var, np.ndarray) +@@ -321,7 +324,6 @@ class TestBatchNormOpInference(unittest.TestCase): + fuse_with_relu=self.fuse_with_relu, + epsilon=epsilon, + ) +- + batch_norm_op.run(scope, place) + + # When op is called without Executor then +@@ -454,7 +456,7 @@ class TestBatchNormOpInference(unittest.TestCase): + ) + + def test_check_output(self): +- for place in get_places(): ++ for place in get_devices(): + for data_format in ["NCHW", "NHWC"]: + self.check_with_place( + place, +@@ -488,8 +490,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): + + def test_check_output(self): + places = [] +- if core.is_compiled_with_cuda(): +- place = core.CUDAPlace(0) ++ if core.is_compiled_with_cuda() or is_custom_device(): ++ place = get_device_place() + if core.is_float16_supported(place): + places.append(place) + for place in places: +@@ -510,8 +512,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA or not support the bfloat16", + ) + class TestBF16BatchNormOpInference(TestBatchNormOpInference): +@@ -522,7 +524,7 @@ class TestBF16BatchNormOpInference(TestBatchNormOpInference): + self.init_kernel_type() + + def test_check_output(self): +- places = [core.CUDAPlace(0)] ++ places = [get_device_place()] + for place in places: + # for data_format in ["NCHW", "NHWC"]: + for data_format in ["NCHW"]: +@@ -562,7 +564,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): + + class TestDygraphBatchNormTrainableStats(unittest.TestCase): + def 
test_dygraph(self): +- for p in get_places(): ++ for p in get_devices(): + shape = [4, 10, 4, 4] + + def compute(x, is_test, trainable_statistics): +@@ -581,7 +583,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): + np.testing.assert_allclose(y1, y2, rtol=1e-05) + + def test_static(self): +- for p in get_places(): ++ for p in get_devices(): + exe = base.Executor(p) + shape = [4, 10, 16, 16] + +@@ -625,7 +627,7 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): + + class TestBatchNormAPI_ZeroSize(unittest.TestCase): + def setUp(self): +- self.places = get_places() ++ self.places = get_devices() + + def test_dygraph(self): + for place in self.places: +diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py +index c9853e9073..277eb26d00 100644 +--- a/test/legacy_test/test_conv3d_transpose_op.py ++++ b/test/legacy_test/test_conv3d_transpose_op.py +@@ -19,7 +19,7 @@ import numpy as np + import paddle + + paddle.enable_static() +-from op_test import OpTest, copy_bits_from_float_to_uint16 ++from op_test import OpTest, copy_bits_from_float_to_uint16, is_custom_device, get_devices, get_device_place + + from paddle.base import core + +@@ -150,7 +150,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): + + def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()), "core is not compiled with CUDA" + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): +@@ -158,20 +158,20 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): + self.dtype = np.float16 + + def test_check_output(self): +- if core.is_compiled_with_cuda(): +- place = core.CUDAPlace(0) ++ if ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()): ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Input'], 'Output', no_grad_set={'Filter'} + ) + + def test_check_grad_no_input(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Filter'], 'Output', no_grad_set={'Input'} +@@ -184,8 +184,8 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): + + def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): +@@ -194,11 +194,11 @@ def create_test_cudnn_bf16_class(parent): + self.dtype = np.uint16 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place(place) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + {'Input', 'Filter'}, +@@ -206,7 +206,7 @@ def create_test_cudnn_bf16_class(parent): + ) + + def test_check_grad_no_filter(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + 
self.check_grad_with_place( + place, + ['Input'], +@@ -215,7 +215,7 @@ def create_test_cudnn_bf16_class(parent): + ) + + def test_check_grad_no_input(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Filter'], +@@ -306,14 +306,14 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_output(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() + + def test_check_grad(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + {'Input', 'Filter'}, +@@ -327,7 +327,7 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_grad_no_filter(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Input'], +@@ -345,7 +345,7 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_grad_no_input(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Filter'], +@@ -471,7 +471,7 @@ class Test_NHWC(TestConv3DTransposeOp): + + # ------------ test_cudnn ------------ + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNN(TestConv3DTransposeOp): + def init_op_type(self): +@@ -481,7 +481,7 @@ class TestCUDNN(TestConv3DTransposeOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + def init_test_case(self): +@@ -500,7 +500,7 @@ class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + def init_test_case(self): +@@ -519,7 +519,7 @@ class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSAMEPad(TestWithSAMEPad): + def init_test_case(self): +@@ -538,7 +538,7 @@ class TestCUDNNWithSAMEPad(TestWithSAMEPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithVALIDPad(TestWithVALIDPad): + def init_test_case(self): +@@ -557,7 +557,7 @@ class TestCUDNNWithVALIDPad(TestWithVALIDPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithStride(TestWithStride): + def init_test_case(self): +@@ -576,7 +576,7 @@ class TestCUDNNWithStride(TestWithStride): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithGroups(TestWithGroups): + def init_test_case(self): +@@ 
-610,7 +610,7 @@ class TestCUDNNWithGroups(TestWithGroups): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNN_NHWC(TestConv3DTransposeOp): + def init_test_case(self): +@@ -630,7 +630,7 @@ class TestCUDNN_NHWC(TestConv3DTransposeOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + def init_test_case(self): +@@ -650,7 +650,7 @@ class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + def init_test_case(self): +@@ -670,7 +670,7 @@ class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithStride_NHWC(TestWithStride): + def init_test_case(self): +@@ -690,7 +690,7 @@ class TestCUDNNWithStride_NHWC(TestWithStride): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithGroups_NHWC(TestWithGroups): + def init_test_case(self): +diff --git a/test/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py +index 74eedb6a48..e4c6ecb98a 100644 +--- a/test/legacy_test/test_cross_entropy_op.py ++++ b/test/legacy_test/test_cross_entropy_op.py +@@ -20,6 +20,8 @@ from op_test import ( + get_places, + paddle_static_guard, + randomize_probability, ++ is_custom_device, ++ get_device_place, + ) + + import paddle +@@ -385,19 +387,19 @@ class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7): + # Add Fp16 test + def create_test_class(parent, cls_name): + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCrossEntropyFP16Op(parent): + def init_dtype_type(self): + return np.float16 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-1) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['X'], 'Y', max_relative_error=0.9 +diff --git a/test/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py +index 4c9944e877..e6ed5c0f8e 100644 +--- a/test/legacy_test/test_fmin_op.py ++++ b/test/legacy_test/test_fmin_op.py +@@ -15,8 +15,7 @@ + import unittest + + import numpy as np +-from op_test import OpTest, convert_float_to_uint16 +- ++from op_test import OpTest, convert_float_to_uint16, is_custom_device, get_devices, get_device_place + import paddle + from paddle.base import core + +@@ -28,8 +27,8 @@ class ApiFMinTest(unittest.TestCase): + + def setUp(self): + """setUp""" +- if core.is_compiled_with_cuda(): +- self.place = 
core.CUDAPlace(0) ++ if core.is_compiled_with_cuda() or is_custom_device(): ++ self.place = get_device_place() + else: + self.place = core.CPUPlace() + +@@ -259,8 +258,8 @@ class TestElementwiseFmin3Op(OpTest): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and not support the bfloat16", + ) + class TestFminBF16OP(OpTest): +@@ -281,13 +280,13 @@ class TestFminBF16OP(OpTest): + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place( + place, check_pir=True, check_symbol_infer=False + ) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True + ) +@@ -304,7 +303,7 @@ class TestElementwiseFminOpZeroSize1(TestElementwiseFminOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestElementwiseFminOp_Stride(OpTest): + no_need_check_grad = True +@@ -335,7 +334,7 @@ class TestElementwiseFminOp_Stride(OpTest): + self.val_dtype = np.float64 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_strided_forward = True + self.check_output( + place, +diff --git a/test/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py +index 80e5c2ec63..f1602a8b40 100644 +--- a/test/legacy_test/test_spectral_norm_op.py ++++ b/test/legacy_test/test_spectral_norm_op.py +@@ -112,6 +112,7 @@ class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad): + + class TestSpectralNormOp(TestSpectralNormOpNoGrad): + def test_check_grad_ignore_uv(self): ++ + self.check_grad( + ['Weight'], + 'Out', diff --git a/third_party/flagcx b/third_party/flagcx index 77495cd6a8..7e6c4cc3ca 160000 --- a/third_party/flagcx From a1530d2b4a9837dc9975fff03fac774a45ea702d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:41:45 +0800 Subject: [PATCH 002/121] [metax]change_cupti_and_fix_softmax (#7) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/kernels/funcs/softmax.cu | 168 ++++++ .../cross_entropy_grad_kernel_register.cu | 10 +- backends/metax_gpu/patch/paddle.patch | 511 ++---------------- .../metax_gpu/runtime/process_cupti_data.cc | 136 +++-- 4 files changed, 309 insertions(+), 516 deletions(-) create mode 100644 backends/metax_gpu/kernels/funcs/softmax.cu diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu new file mode 100644 index 00000000000..d738a53f43a --- /dev/null +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "kernels/metax_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/softmax.h" +#include "paddle/phi/kernels/funcs/softmax_impl.h" + +namespace phi { +namespace funcs { + +using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor; +using DataLayout = phi::backends::gpu::DataLayout; +template +using CudnnDataType = phi::backends::gpu::CudnnDataType; + +template +void SoftmaxCUDNNFunctor::operator()( + const DeviceContext& dev_ctx, + const phi::DenseTensor* X, + phi::DenseTensor* Y) { + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor xDesc; + ScopedTensorDescriptor yDesc; + std::vector cudnn_tensor_dims = common::vectorize(X->dims()); + DataLayout layout = DataLayout::kNCHW; + if (cudnn_tensor_dims.size() == 5) { + layout = DataLayout::kNCDHW; + } + // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor, + // fill 1 at unused dims + if (cudnn_tensor_dims.size() <= 2) { + cudnn_tensor_dims.resize(4, 1); + } +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_x_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + miopenTensorDescriptor_t cudnn_y_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenSoftmaxForward_V2(dev_ctx.cudnn_handle(), + CudnnDataType::kOne(), + cudnn_x_desc, + X->data(), + CudnnDataType::kZero(), + cudnn_y_desc, + dev_ctx.template Alloc(Y), + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_INSTANCE)); +#else + cudnnTensorDescriptor_t cudnn_x_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + cudnnTensorDescriptor_t cudnn_y_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + CudnnDataType::kOne(), + cudnn_x_desc, + X->data(), + CudnnDataType::kZero(), + cudnn_y_desc, + dev_ctx.template Alloc(Y))); +#endif +} + +template +void SoftmaxGradCUDNNFunctor::operator()( + const DeviceContext& dev_ctx, + const phi::DenseTensor* Y, + const phi::DenseTensor* YGrad, + phi::DenseTensor* XGrad) { + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor yDesc; + ScopedTensorDescriptor dyDesc; + ScopedTensorDescriptor dxDesc; + std::vector cudnn_tensor_dims = common::vectorize(Y->dims()); + DataLayout layout = DataLayout::kNCHW; + if (cudnn_tensor_dims.size() == 5) { + layout = DataLayout::kNCDHW; + } + // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor, + // fill 1 at unused dims + if (cudnn_tensor_dims.size() <= 2) { + cudnn_tensor_dims.resize(4, 1); + } +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_y_desc = + yDesc.descriptor(layout, cudnn_tensor_dims); + miopenTensorDescriptor_t cudnn_xgrad_desc = + dxDesc.descriptor(layout, cudnn_tensor_dims); + miopenTensorDescriptor_t cudnn_ygrad_desc = + dyDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxBackward_V2( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + CudnnDataType::kOne(), + cudnn_y_desc, + Y->data(), + cudnn_ygrad_desc, + YGrad->data(), + CudnnDataType::kZero(), + cudnn_xgrad_desc, + dev_ctx.template Alloc(XGrad), + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_INSTANCE)); +#else + cudnnTensorDescriptor_t cudnn_y_desc = + yDesc.descriptor(layout, 
cudnn_tensor_dims); + cudnnTensorDescriptor_t cudnn_xgrad_desc = + dxDesc.descriptor(layout, cudnn_tensor_dims); + cudnnTensorDescriptor_t cudnn_ygrad_desc = + dyDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxBackward( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + CudnnDataType::kOne(), + cudnn_y_desc, + Y->data(), + cudnn_ygrad_desc, + YGrad->data(), + CudnnDataType::kZero(), + cudnn_xgrad_desc, + dev_ctx.template Alloc(XGrad))); +#endif +} + +template class SoftmaxCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#if CUDNN_VERSION_MIN(8, 1, 0) +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif + +// MIOPEN do not support double +#ifndef PADDLE_WITH_HIP +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif + +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu index b5de9dd8f3c..402f69a9958 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu @@ -149,11 +149,11 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx, int ignore_index, int axis, DenseTensor* logits_grad) { - PADDLE_ENFORCE_EQ( - dev_ctx.GetPlace().GetType(), - phi::AllocationType::GPU, - common::errors::Unavailable("softmax_with_cross_entropy operator's " - "CUDA kernel only runs on GPU device.")); + // PADDLE_ENFORCE_EQ( + // dev_ctx.GetPlace().GetType(), + // phi::AllocationType::GPU, + // common::errors::Unavailable("softmax_with_cross_entropy operator's " + // "CUDA kernel only runs on GPU device.")); const T* loss_grad_data = loss_grad.data(); DenseTensor* logit_grad = logits_grad; diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 682cee35caf..1935217baa0 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -419,7 +419,7 @@ index d69eb67d6f..1d8b6e9375 100644 #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu -index bdfd7313af..546bd07d5e 100644 +index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ @@ -438,7 +438,7 @@ index bdfd7313af..546bd07d5e 100644 #include "paddle/phi/kernels/matmul_kernel.h" diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu -index 1a9a9cfb85..08ebe4b8af 100644 +index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ @@ -470,10 +470,10 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/scatter.cu.h" diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h -index dc7935423c..84896c2214 100644 +index e30d440ff3..3c74792690 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h -@@ -32,11 +32,11 @@ limitations under the License. */ +@@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" @@ -487,7 +487,7 @@ index dc7935423c..84896c2214 100644 #endif #define MAX_NUM_THREADS 1024 -@@ -200,21 +200,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], +@@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { if (topk[k] < p) { @@ -549,7 +549,7 @@ index dc7935423c..84896c2214 100644 } template -@@ -243,24 +278,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], +@@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template __device__ __forceinline__ void GetTopK(Pair topk[], const T* src, @@ -581,7 +581,7 @@ index dc7935423c..84896c2214 100644 } } } -@@ -287,7 +322,9 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], +@@ -283,7 +318,9 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], } else { for (int k = 0; k < MaxLength; k++) { if (k < MaxLength - (*beam)) { @@ -592,7 +592,7 @@ index dc7935423c..84896c2214 100644 } else { if (largest) { topk[k].set(-static_cast(INFINITY), -1); -@@ -297,8 +334,10 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], +@@ -293,8 +330,10 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], } } if (!(*is_empty)) { @@ -604,7 +604,7 @@ index dc7935423c..84896c2214 100644 } } -@@ -359,6 +398,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], +@@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } __syncthreads(); @@ -613,7 +613,7 @@ index dc7935423c..84896c2214 100644 if (largest) { input_now = (tid < BlockSize / WARP_SIZE) ? shared_max[lane] -@@ -373,27 +414,32 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], +@@ -369,27 +410,32 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], if (lane == 0) shared_max[0] = input_now; } __syncthreads(); @@ -652,7 +652,7 @@ index dc7935423c..84896c2214 100644 break; } } -@@ -482,16 +528,17 @@ struct Bitfield { +@@ -478,16 +524,17 @@ struct Bitfield { int pos, int len) { unsigned int ret; @@ -674,7 +674,7 @@ index dc7935423c..84896c2214 100644 return ret; } }; -@@ -502,7 +549,9 @@ struct Bitfield { +@@ -498,7 +545,9 @@ struct Bitfield { int pos, int len) { uint64_t ret; @@ -685,7 +685,7 @@ index dc7935423c..84896c2214 100644 return ret; } -@@ -511,9 +560,9 @@ struct Bitfield { +@@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { uint64_t ret; @@ -698,7 +698,7 @@ index dc7935423c..84896c2214 100644 return ret; } }; -@@ -631,14 +680,20 @@ struct RadixTypeConfig { +@@ -627,14 +676,20 @@ struct RadixTypeConfig { /*---------------------------Helper Functions------------------*/ __device__ __forceinline__ int GetLaneId() { int lane_id; @@ -723,7 +723,7 @@ index dc7935423c..84896c2214 100644 } template -@@ -885,7 +940,8 @@ __global__ void GatherKthValue(const T* input, +@@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, // 1. 
Find the k-th value T kth_value = static_cast(0); @@ -733,13 +733,13 @@ index dc7935423c..84896c2214 100644 cur_input, k, num_cols, shared_mem, &kth_value); __shared__ int64_t block_min_idx; -@@ -1318,3 +1374,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, +@@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } } // namespace funcs } // namespace phi +// diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -index 45a29b4cff..8449e3d309 100644 +index 32db61532f..0220316bc3 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ @@ -752,7 +752,7 @@ index 45a29b4cff..8449e3d309 100644 #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h -index 7d05bcb654..c79cdadabc 100644 +index 9d4bb18d55..ea42cc10a9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h @@ -638,9 +638,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( @@ -767,11 +767,11 @@ index 7d05bcb654..c79cdadabc 100644 } } diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -index ad04265bd6..59481d0e6a 100644 +index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/common/bfloat16.h" +@@ -14,7 +14,7 @@ + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -780,11 +780,11 @@ index ad04265bd6..59481d0e6a 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -index 148d72ca9c..5da3461ebf 100644 +index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/common/bfloat16.h" +@@ -14,7 +14,7 @@ + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -793,7 +793,7 @@ index 148d72ca9c..5da3461ebf 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h -index b16553589a..90080c375d 100644 +index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -29,8 +29,8 @@ namespace cub = hipcub; @@ -833,7 +833,7 @@ index 29fa252e96..4ae72b0935 100644 } diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu -index ee71a2b452..69130ab955 100644 +index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -17,7 +17,7 @@ @@ -846,7 +846,7 @@ index ee71a2b452..69130ab955 100644 namespace phi { diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu -index 
00a2f1e210..1267cf7ec2 100644 +index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -17,7 +17,7 @@ @@ -872,7 +872,7 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -index 14b24dd3ed..e54a342c98 100644 +index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -21,7 +21,7 @@ limitations under the License. */ @@ -885,7 +885,7 @@ index 14b24dd3ed..e54a342c98 100644 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -index 06fff0dd58..973049105f 100644 +index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ @@ -1041,461 +1041,12 @@ index 4099d8b506..baef2cd643 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" -diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py -index 4a5660ea0e..ca4e456e02 100644 ---- a/test/legacy_test/test_batch_norm_op.py -+++ b/test/legacy_test/test_batch_norm_op.py -@@ -22,7 +22,9 @@ from op_test import ( - _set_use_system_allocator, - convert_float_to_uint16, - convert_uint16_to_float, -- get_places, -+ get_devices, -+ is_custom_device, -+ get_device_place, - ) - - import paddle -@@ -189,6 +191,7 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): - - - def create_or_get_tensor(scope, var_name, var, place): -+ - tensor = scope.var(var_name).get_tensor() - if var is not None: - assert isinstance(var, np.ndarray) -@@ -321,7 +324,6 @@ class TestBatchNormOpInference(unittest.TestCase): - fuse_with_relu=self.fuse_with_relu, - epsilon=epsilon, - ) -- - batch_norm_op.run(scope, place) - - # When op is called without Executor then -@@ -454,7 +456,7 @@ class TestBatchNormOpInference(unittest.TestCase): - ) - - def test_check_output(self): -- for place in get_places(): -+ for place in get_devices(): - for data_format in ["NCHW", "NHWC"]: - self.check_with_place( - place, -@@ -488,8 +490,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): - - def test_check_output(self): - places = [] -- if core.is_compiled_with_cuda(): -- place = core.CUDAPlace(0) -+ if core.is_compiled_with_cuda() or is_custom_device(): -+ place = get_device_place() - if core.is_float16_supported(place): - places.append(place) - for place in places: -@@ -510,8 +512,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA or not support the bfloat16", - ) - class TestBF16BatchNormOpInference(TestBatchNormOpInference): -@@ -522,7 +524,7 @@ class TestBF16BatchNormOpInference(TestBatchNormOpInference): - self.init_kernel_type() - - def test_check_output(self): -- places = [core.CUDAPlace(0)] -+ places = 
[get_device_place()] - for place in places: - # for data_format in ["NCHW", "NHWC"]: - for data_format in ["NCHW"]: -@@ -562,7 +564,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): - - class TestDygraphBatchNormTrainableStats(unittest.TestCase): - def test_dygraph(self): -- for p in get_places(): -+ for p in get_devices(): - shape = [4, 10, 4, 4] - - def compute(x, is_test, trainable_statistics): -@@ -581,7 +583,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): - np.testing.assert_allclose(y1, y2, rtol=1e-05) - - def test_static(self): -- for p in get_places(): -+ for p in get_devices(): - exe = base.Executor(p) - shape = [4, 10, 16, 16] - -@@ -625,7 +627,7 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): - - class TestBatchNormAPI_ZeroSize(unittest.TestCase): - def setUp(self): -- self.places = get_places() -+ self.places = get_devices() - - def test_dygraph(self): - for place in self.places: -diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py -index c9853e9073..277eb26d00 100644 ---- a/test/legacy_test/test_conv3d_transpose_op.py -+++ b/test/legacy_test/test_conv3d_transpose_op.py -@@ -19,7 +19,7 @@ import numpy as np - import paddle - - paddle.enable_static() --from op_test import OpTest, copy_bits_from_float_to_uint16 -+from op_test import OpTest, copy_bits_from_float_to_uint16, is_custom_device, get_devices, get_device_place - - from paddle.base import core - -@@ -150,7 +150,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): - - def create_test_cudnn_fp16_class(parent, grad_check=True): - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()), "core is not compiled with CUDA" - ) - class TestConv3DTransposeCUDNNFP16(parent): - def init_kernel_type(self): -@@ -158,20 +158,20 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): - self.dtype = np.float16 - - def test_check_output(self): -- if core.is_compiled_with_cuda(): -- place = core.CUDAPlace(0) -+ if ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()): -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-2) - - def test_check_grad_no_filter(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place) and grad_check: - self.check_grad_with_place( - place, ['Input'], 'Output', no_grad_set={'Filter'} - ) - - def test_check_grad_no_input(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place) and grad_check: - self.check_grad_with_place( - place, ['Filter'], 'Output', no_grad_set={'Input'} -@@ -184,8 +184,8 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): - - def create_test_cudnn_bf16_class(parent): - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA and do not support bfloat16", - ) - class TestConv3DTransposeCUDNNBF16(parent): -@@ -194,11 +194,11 @@ def create_test_cudnn_bf16_class(parent): - self.dtype = np.uint16 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place(place) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ 
place = get_device_place() - self.check_grad_with_place( - place, - {'Input', 'Filter'}, -@@ -206,7 +206,7 @@ def create_test_cudnn_bf16_class(parent): - ) - - def test_check_grad_no_filter(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Input'], -@@ -215,7 +215,7 @@ def create_test_cudnn_bf16_class(parent): - ) - - def test_check_grad_no_input(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Filter'], -@@ -306,14 +306,14 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_output(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place(place, atol=1e-5) - else: - self.check_output() - - def test_check_grad(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - {'Input', 'Filter'}, -@@ -327,7 +327,7 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_grad_no_filter(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Input'], -@@ -345,7 +345,7 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_grad_no_input(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Filter'], -@@ -471,7 +471,7 @@ class Test_NHWC(TestConv3DTransposeOp): - - # ------------ test_cudnn ------------ - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNN(TestConv3DTransposeOp): - def init_op_type(self): -@@ -481,7 +481,7 @@ class TestCUDNN(TestConv3DTransposeOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): - def init_test_case(self): -@@ -500,7 +500,7 @@ class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): - def init_test_case(self): -@@ -519,7 +519,7 @@ class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSAMEPad(TestWithSAMEPad): - def init_test_case(self): -@@ -538,7 +538,7 @@ class TestCUDNNWithSAMEPad(TestWithSAMEPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithVALIDPad(TestWithVALIDPad): - def init_test_case(self): -@@ -557,7 +557,7 @@ class TestCUDNNWithVALIDPad(TestWithVALIDPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithStride(TestWithStride): - def init_test_case(self): -@@ -576,7 +576,7 @@ class TestCUDNNWithStride(TestWithStride): - - - 
@unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithGroups(TestWithGroups): - def init_test_case(self): -@@ -610,7 +610,7 @@ class TestCUDNNWithGroups(TestWithGroups): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNN_NHWC(TestConv3DTransposeOp): - def init_test_case(self): -@@ -630,7 +630,7 @@ class TestCUDNN_NHWC(TestConv3DTransposeOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): - def init_test_case(self): -@@ -650,7 +650,7 @@ class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): - def init_test_case(self): -@@ -670,7 +670,7 @@ class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithStride_NHWC(TestWithStride): - def init_test_case(self): -@@ -690,7 +690,7 @@ class TestCUDNNWithStride_NHWC(TestWithStride): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithGroups_NHWC(TestWithGroups): - def init_test_case(self): -diff --git a/test/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py -index 74eedb6a48..e4c6ecb98a 100644 ---- a/test/legacy_test/test_cross_entropy_op.py -+++ b/test/legacy_test/test_cross_entropy_op.py -@@ -20,6 +20,8 @@ from op_test import ( - get_places, - paddle_static_guard, - randomize_probability, -+ is_custom_device, -+ get_device_place, - ) - - import paddle -@@ -385,19 +387,19 @@ class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7): - # Add Fp16 test - def create_test_class(parent, cls_name): - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCrossEntropyFP16Op(parent): - def init_dtype_type(self): - return np.float16 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-1) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_grad_with_place( - place, ['X'], 'Y', max_relative_error=0.9 -diff --git a/test/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py -index 4c9944e877..e6ed5c0f8e 100644 ---- a/test/legacy_test/test_fmin_op.py -+++ b/test/legacy_test/test_fmin_op.py -@@ -15,8 +15,7 @@ - import unittest - - import numpy as np --from op_test import OpTest, convert_float_to_uint16 -- -+from op_test import OpTest, 
convert_float_to_uint16, is_custom_device, get_devices, get_device_place - import paddle - from paddle.base import core - -@@ -28,8 +27,8 @@ class ApiFMinTest(unittest.TestCase): - - def setUp(self): - """setUp""" -- if core.is_compiled_with_cuda(): -- self.place = core.CUDAPlace(0) -+ if core.is_compiled_with_cuda() or is_custom_device(): -+ self.place = get_device_place() - else: - self.place = core.CPUPlace() - -@@ -259,8 +258,8 @@ class TestElementwiseFmin3Op(OpTest): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA and not support the bfloat16", - ) - class TestFminBF16OP(OpTest): -@@ -281,13 +280,13 @@ class TestFminBF16OP(OpTest): - self.outputs = {'Out': convert_float_to_uint16(out)} - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place( - place, check_pir=True, check_symbol_infer=False - ) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True - ) -@@ -304,7 +303,7 @@ class TestElementwiseFminOpZeroSize1(TestElementwiseFminOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestElementwiseFminOp_Stride(OpTest): - no_need_check_grad = True -@@ -335,7 +334,7 @@ class TestElementwiseFminOp_Stride(OpTest): - self.val_dtype = np.float64 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_strided_forward = True - self.check_output( - place, -diff --git a/test/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py -index 80e5c2ec63..f1602a8b40 100644 ---- a/test/legacy_test/test_spectral_norm_op.py -+++ b/test/legacy_test/test_spectral_norm_op.py -@@ -112,6 +112,7 @@ class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad): - - class TestSpectralNormOp(TestSpectralNormOpNoGrad): - def test_check_grad_ignore_uv(self): -+ - self.check_grad( - ['Weight'], - 'Out', diff --git a/third_party/flagcx b/third_party/flagcx -index 77495cd6a8..7e6c4cc3ca 160000 +index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx +++ b/third_party/flagcx @@ -1 +1 @@ --Subproject commit 77495cd6a84b1c8f88dd8f6f99e63ef3c84c766f +-Subproject commit 7c469f4af991bf0f64b8f76d66f8e307a5eaea3f +Subproject commit 7e6c4cc3cad3fce9b3dedfe46a9d195d616e8ffa diff --git a/third_party/flashattn b/third_party/flashattn index 581e48aa69..749aca3807 160000 diff --git a/backends/metax_gpu/runtime/process_cupti_data.cc b/backends/metax_gpu/runtime/process_cupti_data.cc index 65011e3f58d..94caca5d8cb 100755 --- a/backends/metax_gpu/runtime/process_cupti_data.cc +++ b/backends/metax_gpu/runtime/process_cupti_data.cc @@ -226,52 +226,126 @@ class CuptiRuntimeCbidStr { CuptiRuntimeCbidStr::CuptiRuntimeCbidStr() { #define REGISTER_RUNTIME_CBID_STR(cbid) \ cbid_str_[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid - REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020); - REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020); - REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000); - REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050); - REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020); 
REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFree_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020); + REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020); - REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020); - REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020); + REGISTER_RUNTIME_CBID_STR(cudaChooseDevice_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetValidDevices_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetDeviceFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocPitch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFree_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFreeArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020); REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020); REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020); - REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000); - REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaHostGetFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemGetInfo_v3020); REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2DToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToSymbol_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyFromSymbol_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToSymbolAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyFromSymbolAsync_v3020); REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020); - REGISTER_RUNTIME_CBID_STR( - cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000); - REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020); - REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020); - REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset2DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetSymbolAddress_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetSymbolSize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTexture2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTextureToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020); - REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000); - 
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050); - REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaStreamQuery_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventCreate_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventElapsedTime_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc3DArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset3DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DAsync_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020); - REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); - REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); - REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaPointerGetAttributes_v4000); + REGISTER_RUNTIME_CBID_STR(cudaHostRegister_v4000); + REGISTER_RUNTIME_CBID_STR(cudaHostUnregister_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceCanAccessPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceEnablePeerAccess_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceDisablePeerAccess_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyPeerAsync_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DPeerAsync_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceReset_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetLimit_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetLimit_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaProfilerInitialize_v4000); + REGISTER_RUNTIME_CBID_STR(cudaProfilerStart_v4000); + REGISTER_RUNTIME_CBID_STR(cudaProfilerStop_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetByPCIBusId_v4010); REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcGetEventHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcOpenEventHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcGetMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcOpenMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcCloseMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaStreamAddCallback_v5000); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000); + REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamGetPriority_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamGetFlags_v5050); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050); + REGISTER_RUNTIME_CBID_STR(cudaMallocManaged_v6000); + REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000); + REGISTER_RUNTIME_CBID_STR(cudaStreamAttachMemAsync_v6000); + 
REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050); + REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000); + REGISTER_RUNTIME_CBID_STR(cudaGetDeviceFlags_v7000); + REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000); + REGISTER_RUNTIME_CBID_STR(cudaMemRangeGetAttribute_v8000); + REGISTER_RUNTIME_CBID_STR(cudaMemRangeGetAttributes_v8000); #if CUDA_VERSION >= 9000 REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetAttribute_v9000); + REGISTER_RUNTIME_CBID_STR(cudaGraphLaunch_v10000); + REGISTER_RUNTIME_CBID_STR(cudaStreamSetAttribute_v11000); + REGISTER_RUNTIME_CBID_STR(cudaMallocAsync_v11020); + REGISTER_RUNTIME_CBID_STR(cudaFreeAsync_v11020); #endif #undef REGISTER_RUNTIME_CBID_STR } From 352f02e869be9bccd1c9d154d2c70151626a43ea Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 9 Sep 2025 16:45:38 +0800 Subject: [PATCH 003/121] [Metax] fix dgc & mklml compile product path problem (#8) --- backends/metax_gpu/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 5022e1bdde3..beb442eadad 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -26,6 +26,10 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") set(WITH_MKLML ON) +set(THIRD_PARTY_PATH + "${PADDLE_SOURCE_DIR}/build/third_party" + CACHE PATH "Third party libraries directory.") + include(paddle) include(version) include(generic) @@ -52,10 +56,6 @@ option(ON_INFER "compile with inference c++ lib" OFF) option(WITH_GPU "Compile PaddlePaddle with METAX_GPU" ON) option(WITH_CUSTOM_DEVICE "Compile PaddlePaddle with CUSTOM_DEVICE" ON) -set(THIRD_PARTY_PATH - "${PADDLE_SOURCE_DIR}/build/third_party" - CACHE PATH "Third party libraries directory.") - macro(UNSET_VAR VAR_NAME) unset(${VAR_NAME} CACHE) unset(${VAR_NAME}) From 8f13faed41890653f7f57328674c672c77dcfa4c Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:18:33 +0800 Subject: [PATCH 004/121] [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test (#9) * [Metax] fix dgc & mklml compile product path problem * [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test * [Metax] add mixed_vector fix & update change patch --- backends/metax_gpu/CMakeLists.txt | 2 +- backends/metax_gpu/build.sh | 26 +- backends/metax_gpu/build_in_metax.sh | 17 +- backends/metax_gpu/change_patch.sh | 9 +- .../cuda_kernels/accuracy_kernel_register.cu | 141 ++- backends/metax_gpu/patch/tmp/mixed_vector.cc | 111 ++ backends/metax_gpu/patch/tmp/mixed_vector.h | 413 ++++++++ .../tests/unittest/test_accuracy_op_metax.py | 206 ++++ .../tests/unittest/test_gather_op_metax.py | 983 +++++++++++++++--- 9 files changed, 1740 insertions(+), 168 deletions(-) create mode 100644 backends/metax_gpu/patch/tmp/mixed_vector.cc create mode 100644 backends/metax_gpu/patch/tmp/mixed_vector.h create mode 100644 backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index beb442eadad..4567723123c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -128,7 +128,7 @@ file( 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/arange_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/adadelta_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_check_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/allclose_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/all_gather_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/all_reduce_kernel.cu diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 0350a32521f..dd0ab3aab90 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,25 +31,7 @@ git submodule sync --recursive && git submodule update --init --recursive # apply patch - -rm -r ../../Paddle/third_party/eigen3 - - -cd patch - -unzip mcEigen_3.4.0_paddle_final.zip - -mv mcEigen_3.4.0_paddle_final eigen3 - -cd .. - -cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 - -cd ../../Paddle/ - -git apply --verbose ../backends/metax_gpu/patch/paddle.patch - -cd - +bash change_patch.sh export MACA_PATH=/opt/maca diff --git a/backends/metax_gpu/build_in_metax.sh b/backends/metax_gpu/build_in_metax.sh index b1f9d63d85c..67ec1a2c31c 100644 --- a/backends/metax_gpu/build_in_metax.sh +++ b/backends/metax_gpu/build_in_metax.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,16 +22,7 @@ git submodule sync --recursive && git submodule update --init --recursive # apply patch - -rm -r ../../Paddle/third_party/eigen3 -cd patch -unzip mcEigen_3.4.0_paddle_final.zip -mv mcEigen_3.4.0_paddle_final eigen3 -cd .. -cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 -cd ../../Paddle/ -git apply --verbose ../backends/metax_gpu/patch/paddle.patch -cd - +bash change_patch.sh export MACA_PATH=/opt/maca export CUDA_PATH=/workspace/cuda-11.7/ diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 58bda1aacd4..833ae00f6bd 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,11 +16,12 @@ # limitations under the License. rm -r ../../Paddle/third_party/eigen3 -cd patch +cd patch unzip mcEigen_3.4.0_paddle_final.zip mv mcEigen_3.4.0_paddle_final eigen3 cd .. cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 +cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - diff --git a/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu index 1b26e5711ac..0d61c79d0fa 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu @@ -1,7 +1,7 @@ // 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights // Reserved. -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,19 +14,150 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/accuracy_kernel.h" +namespace phi { +using phi::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void AccuracyCudaKernel(const int N, + const int D, + const int64_t* Xdata, + const int64_t* labeldata, + int* correct_data, + T* accuracy, + int* total_data) { + using MT = typename phi::dtype::MPTypeTrait::Type; + int count = 0; + __shared__ int total[BlockSize]; + + // support only 1 block + for (int i = threadIdx.x; i < (N); i += BlockSize) { + for (int j = 0; j < D; ++j) { + if (Xdata[i * D + j] == labeldata[i]) { + ++count; + break; + } + } + } + total[threadIdx.x] = count; + __syncthreads(); + + // reduce the count with init value 0, and output accuracy. + // #ifdef PADDLE_WITH_CUDA + // int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); + // #else + // HIP thrust::reduce not support __device__ + for (int s = BlockSize / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + total[threadIdx.x] += total[threadIdx.x + s]; + } + __syncthreads(); + } + int result = total[0]; + // #endif + if (threadIdx.x == 0) { + *correct_data = result; + *accuracy = static_cast(static_cast(result) / static_cast(N)); + *total_data = N; + } +} + +template +void AccuracyKernel(const Context& dev_ctx, + const DenseTensor& inference, + const DenseTensor& indices, + const DenseTensor& label, + DenseTensor* accuracy, + DenseTensor* correct, + DenseTensor* total) { + // FIXME(typhoonzero): only support indices currently + // if add support for output values, how to detect the data type? 
+ const int64_t* indices_data = indices.data(); + const int64_t* label_data = label.data(); + + PADDLE_ENFORCE_EQ( + inference.dims().size(), + 2, + common::errors::InvalidArgument( + "Rank(Input) of AccuracyOp must be 2, with shape " + "[sample_number, class_dim], But received rank(Input) is %d", + inference.dims().size())); + + int* correct_data = dev_ctx.template Alloc(correct); + int* total_data = dev_ctx.template Alloc(total); + T* accuracy_data = dev_ctx.template Alloc(accuracy); + + int num_samples = static_cast(inference.dims()[0]); + size_t infer_width = inference.dims()[1]; + auto stream = dev_ctx.stream(); + phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream); + + PADDLE_ENFORCE_GT(label.dims().size(), + 0, + common::errors::InvalidArgument( + "Rank(Label) of AccuracyOp must greater than 0, " + "But received rank(Label) is %d", + label.dims().size())); + + PADDLE_ENFORCE_GE(label.dims()[0], + inference.dims()[0], + common::errors::InvalidArgument( + "num_samples(%d) of Label should less than " + "or equal to num_samples(%d) of Input", + label.dims()[0], + num_samples)); + + if (num_samples == 0) { + return; + } + + AccuracyCudaKernel + <<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(num_samples, + infer_width, + indices_data, + label_data, + correct_data, + accuracy_data, + total_data); +} +} // namespace phi + +// FIXME(typhoonzero): types of T is for inference data. +// label data is always int64 +PD_REGISTER_KERNEL(accuracy, + GPU, + ALL_LAYOUT, + phi::AccuracyKernel, + phi::float16, + phi::bfloat16, + float, + double) { + kernel->InputAt(1).SetDataType(phi::DataType::INT64); + kernel->InputAt(2).SetDataType(phi::DataType::INT64); + kernel->OutputAt(1).SetDataType(phi::DataType::INT32); + kernel->OutputAt(2).SetDataType(phi::DataType::INT32); +} + PD_CUSTOM_KERNEL_REGISTER(accuracy, metax_gpu, ALL_LAYOUT, phi::AccuracyKernel, - phi::dtype::float16, - phi::dtype::bfloat16, + phi::float16, + phi::bfloat16, float, double) { - kernel->InputAt(1).SetDataType(phi::DataType::INT32); - kernel->InputAt(2).SetDataType(phi::DataType::INT32); + kernel->InputAt(1).SetDataType(phi::DataType::INT64); + kernel->InputAt(2).SetDataType(phi::DataType::INT64); kernel->OutputAt(1).SetDataType(phi::DataType::INT32); kernel->OutputAt(2).SetDataType(phi::DataType::INT32); } diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.cc b/backends/metax_gpu/patch/tmp/mixed_vector.cc new file mode 100644 index 00000000000..a90113c7977 --- /dev/null +++ b/backends/metax_gpu/patch/tmp/mixed_vector.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
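[Editor's note] On the reduction inside AccuracyCudaKernel above: because HIP's thrust::reduce cannot be invoked from device code, the kernel folds each thread's private hit count through a shared-memory tree reduction, halving the active range on every step. The following is a minimal standalone sketch of that pattern only — the kernel name is hypothetical, it is not code from this patch, and it assumes BlockSize is a power of two (as PADDLE_CUDA_NUM_THREADS is):

// Hypothetical sketch, not part of the patch: block-level tree reduction.
template <int BlockSize>
__global__ void BlockSumSketch(const int* per_thread_count, int* block_total) {
  __shared__ int partial[BlockSize];
  partial[threadIdx.x] = per_thread_count[threadIdx.x];  // stage private counts
  __syncthreads();
  for (int s = BlockSize / 2; s > 0; s >>= 1) {  // assumes power-of-two BlockSize
    if (threadIdx.x < s) {
      partial[threadIdx.x] += partial[threadIdx.x + s];
    }
    __syncthreads();  // every halving must finish before the next one reads
  }
  if (threadIdx.x == 0) {
    *block_total = partial[0];  // thread 0 publishes the single-block result
  }
}

The __syncthreads() after each halving is what keeps the pattern correct: no thread may read a partial sum before the thread that produced it has written it.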
*/ + +#include "paddle/phi/core/mixed_vector.h" + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "glog/logging.h" +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/common/memory_utils.h" +#include "paddle/utils/none.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void CopyToCPUHelper(std::vector *cpu_, + phi::Allocator::AllocationPtr *gpu_, + size_t *gpu_memory_size_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // COPY GPU Data To CPU + auto *dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get((*gpu_)->place())); + auto stream = dev_ctx->stream(); + void *src = (*gpu_)->ptr(); + void *dst = cpu_->data(); + auto place = dev_ctx->GetPlace(); + if (place.GetType() == phi::AllocationType::GPU) { + memory_utils::Copy(phi::CPUPlace(), + dst, + OptionalCUDAPlace(*gpu_).get(), + src, + *gpu_memory_size_, + stream); + } else { + memory_utils::Copy(phi::CPUPlace(), + dst, + OptionalCustomPlace(*gpu_).get(), + src, + *gpu_memory_size_, + stream); + } + dev_ctx->Wait(); +#endif +} + +template +void CopyCPUDataToCUDAHelper(std::vector *cpu_, + phi::Allocator::AllocationPtr *gpu_, + size_t *gpu_memory_size_, + const phi::Place &place) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void *src = cpu_->data(); + *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) + (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_); + void *dst = (*gpu_)->ptr(); + auto *dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(place)); + auto stream = dev_ctx->stream(); + if (place.GetType() == phi::AllocationType::GPU) { + memory_utils::Copy(OptionalCUDAPlace(*gpu_).get(), + dst, + phi::CPUPlace(), + src, + *gpu_memory_size_, + stream); + } else { + memory_utils::Copy(OptionalCustomPlace(*gpu_).get(), + dst, + phi::CPUPlace(), + src, + *gpu_memory_size_, + stream); + } + dev_ctx->Wait(); +#endif +} + +#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyToCPU() const { \ + CopyToCPUHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_); \ + } \ + \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \ + const phi::Place &place) const { \ + CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_, place); \ + } + +INSTANTIATE_VECTOR_FOR_TYPE(size_t) +INSTANTIATE_VECTOR_FOR_TYPE(int) +INSTANTIATE_VECTOR_FOR_TYPE(int64_t) + +}; // namespace phi diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.h b/backends/metax_gpu/patch/tmp/mixed_vector.h new file mode 100644 index 00000000000..e7cf1e626c9 --- /dev/null +++ b/backends/metax_gpu/patch/tmp/mixed_vector.h @@ -0,0 +1,413 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
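[Editor's note] CopyCPUDataToCUDAHelper above sizes the device allocation from the std::vector, enqueues an asynchronous host-to-device copy on the context's stream, and then blocks in dev_ctx->Wait() so the host buffer can be reused safely. Expressed with the raw CUDA runtime API instead of phi::memory_utils, the same copy-then-wait shape looks roughly like this (illustrative sketch only, hypothetical function name, not the patch's code):

#include <cstdint>
#include <cuda_runtime.h>
#include <vector>

// Hypothetical sketch: stage a host vector on the device and wait for the
// copy to finish, mirroring the structure of CopyCPUDataToCUDAHelper.
cudaError_t StageOnDeviceSketch(const std::vector<int64_t>& host,
                                int64_t** device_out,
                                cudaStream_t stream) {
  const size_t bytes = host.size() * sizeof(int64_t);
  cudaError_t err = cudaMalloc(reinterpret_cast<void**>(device_out), bytes);
  if (err != cudaSuccess) return err;
  err = cudaMemcpyAsync(*device_out, host.data(), bytes,
                        cudaMemcpyHostToDevice, stream);  // async H2D enqueue
  if (err != cudaSuccess) return err;
  return cudaStreamSynchronize(stream);  // counterpart of dev_ctx->Wait()
}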
*/ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "glog/logging.h" +#include "paddle/common/errors.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/utils/none.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +using Vector = std::vector; + +inline paddle::optional OptionalCUDAPlace( + const phi::Allocator::AllocationPtr &gpu_) { + return gpu_ == nullptr ? paddle::none + : paddle::optional(gpu_->place()); +} + +inline paddle::optional OptionalCustomPlace( + const phi::Allocator::AllocationPtr &gpu_) { + return gpu_ == nullptr ? paddle::none + : paddle::optional(gpu_->place()); +} + +// Vector implements the std::vector interface, and can get Data or +// MutableData from any place. The data will be synced implicitly inside. +template +class MixVector { + public: + using value_type = T; + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; + + private: + // The actual class to implement vector logic + class VectorData { + public: + template + explicit VectorData(std::vector *dat) : cpu_(dat), flag_(kDataInCPU) {} + ~VectorData() {} + + VectorData(const VectorData &o) = delete; + + VectorData &operator=(const VectorData &o) = delete; + + T &operator[](size_t i) { + MutableCPU(); + return (*cpu_)[i]; + } + + const T &operator[](size_t i) const { + ImmutableCPU(); + return (*cpu_)[i]; + } + + size_t size() const { return (*cpu_).size(); } + + iterator begin() { + MutableCPU(); + return (*cpu_).begin(); + } + + iterator end() { + MutableCPU(); + return (*cpu_).end(); + } + + T &front() { + MutableCPU(); + return (*cpu_).front(); + } + + T &back() { + MutableCPU(); + return (*cpu_).back(); + } + + const_iterator begin() const { + ImmutableCPU(); + return (*cpu_).begin(); + } + + const_iterator end() const { + ImmutableCPU(); + return (*cpu_).end(); + } + + const T &back() const { + ImmutableCPU(); + return (*cpu_).back(); + } + + T *data() { return cpu_->data(); } + + const T *data() const { return cpu_->data(); } + + const T &front() const { + ImmutableCPU(); + return (*cpu_).front(); + } + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + MutableCPU(); + (*cpu_).assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + MutableCPU(); + (*cpu_).push_back(elem); + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + MutableCPU(); + auto out_it = std::back_inserter>(*(this->cpu_)); + std::copy(begin, end, out_it); + } + + // resize the vector + void resize(size_t size) { + MutableCPU(); + (*cpu_).resize(size); + } + + // get cuda ptr. immutable + const T *CUDAData(phi::Place place) const { + PADDLE_ENFORCE_EQ( + place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM, + true, + common::errors::Unavailable( + "Place mismatch, CUDA Data must be on CUDA place.")); + ImmutableCUDA(place); + return reinterpret_cast(gpu_->ptr()); + } + + // get cuda ptr. 
mutable + T *CUDAMutableData(phi::Place place) { + const T *ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + (*cpu_).clear(); + flag_ = kDirty | kDataInCPU; + } + + std::vector *get_vector() { return cpu_; } + + size_t capacity() const { return (*cpu_).capacity(); } + + // reserve data + void reserve(size_t size) const { (*cpu_).reserve(size); } + + std::mutex &Mutex() const { return mtx_; } + + paddle::optional CUDAPlace() const { + return OptionalCUDAPlace(gpu_); + } + + paddle::optional CustomPlace() const { + return OptionalCustomPlace(gpu_); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + + private: + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. + kDirty = 0x10 + }; + + void CopyToCPU() const; + + void ImmutableCUDA(phi::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + CopyCPUDataToCUDA(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && !(place == gpu_->place())) { + PADDLE_THROW( + common::errors::Unavailable("Unexpected data place mismatch.")); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + CopyCPUDataToCUDA(place); + SetFlag(kDataInCUDA); + } else if (!(place == gpu_->place())) { + PADDLE_THROW( + common::errors::Unavailable("Unexpected data place mismatch.")); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void CopyCPUDataToCUDA(const phi::Place &place) const; + + void ImmutableCPU() const { + if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or + // CPU has no data. + CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + std::vector *cpu_; + mutable phi::Allocator::AllocationPtr gpu_; + mutable size_t gpu_memory_size_{0}; + mutable int flag_; + + mutable std::mutex mtx_; + }; + + public: + // implicit cast from std::vector. + template + MixVector(const std::vector *dat) { // NOLINT + m_.reset(new VectorData(const_cast *>(dat))); + } + + // Copy ctor + MixVector(const MixVector &other) = delete; + + // Copy operator + MixVector &operator=(const MixVector &other) = delete; + + // Move ctor + MixVector(MixVector &&other) = delete; + + // CPU data access method. Mutable. + T &operator[](size_t i) { return (*m_)[i]; } + + // CPU data access method. Immutable. + const T &operator[](size_t i) const { return (*m_)[i]; } + + // std::vector iterator methods. 
Based on CPU data access method + size_t size() const { return m_->size(); } + + iterator begin() { return m_->begin(); } + + iterator end() { return m_->end(); } + + T &front() { return m_->front(); } + + T &back() { return m_->back(); } + + const_iterator begin() const { return m_->begin(); } + + const_iterator end() const { return m_->end(); } + + const_iterator cbegin() const { return begin(); } + + const_iterator cend() const { return end(); } + + const T &back() const { return m_->back(); } + + T *data() { return m_->data(); } + + const T *data() const { return m_->data(); } + + const T &front() const { return m_->front(); } + // end of std::vector iterator methods + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + m_->assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { m_->push_back(elem); } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + m_->Extend(begin, end); + } + + // resize the vector + void resize(size_t size) { + if (m_->size() != size) { + m_->resize(size); + } + } + + // get cuda ptr. immutable + const T *CUDAData(phi::Place place) const { + { + phi::GPUPlace p(place.GetDeviceId()); + auto &mtx = m_->Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_->CUDAPlace(); + if (cuda_place == paddle::none || cuda_place == p) { + return m_->CUDAData(place); + } + } + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); + return CUDAData(place); + } + + // get cuda ptr. mutable + T *CUDAMutableData(phi::Place place) { + { + phi::GPUPlace p(place.GetDeviceId()); + auto &mtx = m_->Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_->CUDAPlace(); + if (cuda_place == paddle::none || cuda_place == p) { + return m_->CUDAMutableData(place); + } + } + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); + return CUDAMutableData(place); + } + + // clear + void clear() { m_->clear(); } + + size_t capacity() const { return m_->capacity(); } + + // reserve data + void reserve(size_t size) { m_->reserve(size); } + + // the unify method to access CPU or CUDA data. immutable. + const T *Data(phi::Place place) const { + if (place.GetType() == phi::AllocationType::GPU) { + return CUDAData(place); + } else { + return data(); + } + } + + // the unify method to access CPU or CUDA data. mutable. + T *MutableData(phi::Place place) { + if (place.GetType() == phi::AllocationType::GPU) { + return CUDAMutableData(place); + } else { + return data(); + } + } + + void CopyToCPU() { m_->MutableCPU(); } + + const void *Handle() const { return m_.get(); } + + private: + mutable std::unique_ptr m_; +}; + +}; // namespace phi diff --git a/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py b/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py new file mode 100644 index 00000000000..910ef5cd1a6 --- /dev/null +++ b/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py @@ -0,0 +1,206 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
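[Editor's note] On the MixVector header added above: the wrapper keeps the backing std::vector on the host and tracks where the current copy lives with the kDataInCPU / kDataInCUDA / kDirty flags, so a device pointer is materialized lazily on the first CUDAData() call while plain host reads stay cheap. A rough usage sketch against the interface shown above — the surrounding function and the device index are hypothetical:

#include <cstdint>
#include <vector>
// #include "paddle/phi/core/mixed_vector.h"  // the header introduced above

void MixVectorUsageSketch() {
  std::vector<int64_t> rows = {0, 3, 7};
  phi::MixVector<int64_t> mixed(&rows);       // starts flagged as kDataInCPU
  phi::GPUPlace gpu_place(0);                 // hypothetical device index
  const int64_t* dev_ptr = mixed.CUDAData(gpu_place);  // first call copies H2D
  // ... launch a kernel that reads dev_ptr on gpu_place's stream ...
  (void)dev_ptr;
  int64_t first = mixed[0];  // device view was read-only, so no copy-back occurs
  (void)first;
}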
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import ( + OpTest, + convert_float_to_uint16, + paddle_static_guard, + is_custom_device, + get_device_place, +) + +import paddle +from paddle import base +from paddle.base import Program, core, program_guard + + +def accuracy_wrapper(infer, indices, label): + return paddle._C_ops.accuracy(infer, indices, label) + + +class TestAccuracyOp(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.python_api = accuracy_wrapper + self.dtype = np.float32 + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype("int64") + label = np.random.randint(0, 2, (n, 1)).astype("int64") + self.inputs = {"Out": infer, "Indices": indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + "Accuracy": np.array(num_correct / float(n)).astype(self.dtype), + "Correct": np.array(num_correct).astype("int32"), + "Total": np.array(n).astype("int32"), + } + + def init_dtype(self): + pass + + def test_check_output(self): + self.check_output(check_pir=True) + + +class TestAccuracyOpFp16(TestAccuracyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3, check_pir=True) + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()) + or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestAccuracyOpBf16(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.python_api = accuracy_wrapper + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(np.float32) + indices = np.random.randint(0, 2, (n, 1)).astype("int64") + label = np.random.randint(0, 2, (n, 1)).astype("int64") + self.inputs = { + "Out": convert_float_to_uint16(infer), + "Indices": indices, + "Label": label, + } + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + "Accuracy": convert_float_to_uint16( + np.array(num_correct / float(n)).astype(np.float32) + ), + "Correct": np.array(num_correct).astype("int32"), + "Total": np.array(n).astype("int32"), + } + + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + if core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-2, check_pir=True) + + +class TestAccuracyOpError(unittest.TestCase): + def test_type_errors(self): + with ( + paddle_static_guard(), + program_guard(Program(), Program()), + ): + # The input type of accuracy_op must be Variable. + x1 = base.create_lod_tensor(np.array([[-1]]), [[1]], base.CPUPlace()) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int32") + self.assertRaises(TypeError, paddle.static.accuracy, x1, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x1, label) + # The input dtype of accuracy_op must be float32 or float64. 
+ x2 = paddle.static.data(name="x2", shape=[-1, 4], dtype="int32") + self.assertRaises(TypeError, paddle.static.accuracy, x2, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x2, label) + + x3 = paddle.static.data(name="input", shape=[-1, 2], dtype="float32") + paddle.static.accuracy(input=x3, label=label) + paddle.metric.accuracy(input=x3, label=label) + + def test_value_errors(self): + with ( + program_guard(Program(), Program()), + # The input rank of accuracy_op must be 2. + self.assertRaises(ValueError), + ): + x3 = paddle.to_tensor([0.1], dtype="float32") + label3 = paddle.to_tensor(np.reshape([0], [1, 1]), dtype="int32") + paddle.metric.accuracy(x3, label3) + + +class TestAccuracyAPI1(unittest.TestCase): + def run_api(self, accuracy_api): + with ( + paddle_static_guard(), + paddle.static.program_guard(paddle.static.Program()), + ): + self.predictions = paddle.static.data( + shape=[2, 5], name="predictions", dtype="float32" + ) + self.label = paddle.static.data(shape=[2, 1], name="labels", dtype="int64") + self.result = accuracy_api(input=self.predictions, label=self.label, k=1) + self.input_predictions = np.array( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + self.input_labels = np.array([[2], [0]], dtype="int64") + self.expect_value = np.array([0.5], dtype="float32") + exe = paddle.static.Executor() + (result,) = exe.run( + feed={ + "predictions": self.input_predictions, + "labels": self.input_labels, + }, + fetch_list=[self.result], + ) + self.assertEqual((result == self.expect_value).all(), True) + + def test_api(self): + self.run_api(accuracy_api=paddle.static.accuracy) + self.run_api(accuracy_api=paddle.metric.accuracy) + + +class TestAccuracyAPI2(unittest.TestCase): + def test_api(self): + with base.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.static.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype="float32") + self.assertEqual((result.numpy() == expect_value).all(), True) + + +class TestAccuracyAPI(unittest.TestCase): + def test_api(self): + with base.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.metric.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype="float32") + + self.assertEqual((result.numpy() == expect_value).all(), True) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unittest/test_gather_op_metax.py b/backends/metax_gpu/tests/unittest/test_gather_op_metax.py index bdf116571f7..3ce39588838 100644 --- a/backends/metax_gpu/tests/unittest/test_gather_op_metax.py +++ b/backends/metax_gpu/tests/unittest/test_gather_op_metax.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function import unittest -from op_test import OpTest import numpy as np -import paddle +from op_test import ( + OpTest, + convert_float_to_uint16, + get_devices, + is_custom_device, + get_device_place, +) +from utils import dygraph_guard -paddle.enable_static() +import paddle +from paddle import base +from paddle.base.dygraph.base import switch_to_static_graph +from paddle.framework import core def gather_numpy(x, index, axis): @@ -32,29 +40,119 @@ def gather_numpy(x, index, axis): class TestGatherOp(OpTest): def setUp(self): self.op_type = "gather" - self.place = paddle.CustomPlace("metax_gpu", 0) - self.__class__.use_custom_device = True self.python_api = paddle.gather + self.public_python_api = paddle.gather self.config() - xnp = np.random.random(self.x_shape).astype(self.x_type) - self.inputs = {"X": xnp, "Index": np.array(self.index).astype(self.index_type)} - self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]} + self.prim_op_type = "prim" + self.init_inputs_and_outputs() + self.if_enable_cinn() def test_check_output(self): - self.check_output_with_place(self.place) + self.check_output(check_pir=True, check_symbol_infer=False) def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") + self.check_grad(["X"], "Out", check_pir=True, check_prim_pir=True) def config(self): """ For multi-dimension input """ self.x_shape = (10, 20) - self.x_type = "float32" + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int32" + def config_dtype(self): + self.x_type = "float64" + + def init_inputs_and_outputs(self): + xnp = np.random.random(self.x_shape).astype(self.x_type) + if self.x_type == "complex64" or self.x_type == "cpmolex128": + xnp = ( + np.random.randint(-10, 10, size=(10, 10)) + + 1j * np.random.randint(-10, 10, size=(10, 10)) + ).astype(self.x_type) + self.inputs = { + "X": xnp, + "Index": np.array(self.index).astype(self.index_type), + } + self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]} + + def if_enable_cinn(self): + pass + + +class TestGatherOp_ZeroDim(TestGatherOp): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = 100 + self.config_dtype() + self.index = 2 + self.index_type = "int32" + + def if_enable_cinn(self): + self.enable_cinn = False + + +class TestGatherOpFP16(TestGatherOp): + def config_dtype(self): + self.x_type = "float16" + + +# @unittest.skipIf( +# not (core.is_compiled_with_cuda() or is_custom_device()) +# # or core.cudnn_version() < 8100 +# # or paddle.device.cuda.get_device_capability()[0] < 8, +# # "only support compiled with CUDA and cudnn version need larger than 8.1.0 and device's compute capability is at least 8.0", +# ) +class TestGatherOpBFP16(TestGatherOp): + def config_dtype(self): + self.x_type = "float32" + self.dtype = np.uint16 + + def init_inputs_and_outputs(self): + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": np.array(self.index).astype(self.index_type), + } + self.outputs = {"Out": convert_float_to_uint16(xnp[self.inputs["Index"]])} + + def if_enable_cinn(self): + self.enable_cinn = False + + def test_check_output(self): + self.check_output_with_place( + place=get_device_place(), check_pir=True, check_symbol_infer=False + ) + + def test_check_grad(self): + self.check_grad_with_place( + get_device_place(), + ["X"], + "Out", + check_pir=True, + check_prim_pir=True, + ) + + +class TestGatherOpComplex64(TestGatherOp): + def config_dtype(self): + self.x_type = 
"complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOpComplex128(TestGatherOp): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + class TestCase1(TestGatherOp): def config(self): @@ -62,10 +160,42 @@ def config(self): For one dimension input """ self.x_shape = 100 - self.x_type = "float32" + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int32" + def config_dtype(self): + self.x_type = "float64" + + +class TestCase1FP16(TestCase1): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase1BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = 100 + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestCase1Complex64(TestCase1): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase1Complex128(TestCase1): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + class TestCase2(TestGatherOp): def config(self): @@ -73,42 +203,574 @@ def config(self): For int64_t index type """ self.x_shape = 100 - self.x_type = "float32" + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase2FP16(TestCase2): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase2BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = 100 + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + +class TestCase2Complex64(TestCase2): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase2Complex128(TestCase2): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase3(TestGatherOp): + def config(self): + """ + For other input type + """ + self.x_shape = (10, 20) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase3Fp16(TestCase3): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase3BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int64" +class TestCase3Complex64(TestCase3): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase3Complex128(TestCase3): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase4(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase4FP16(TestCase4): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase4BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + +class TestCase4Complex64(TestCase4): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase4Complex128(TestCase4): + def config_dtype(self): + self.x_type = 
"complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase5(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1, 3] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase5BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + +class TestCase5FP16(TestCase5): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase5Complex64(TestCase5): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase5Complex128(TestCase5): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase6(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": True} + self.config_dtype() + self.index = [1, 3] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase6FP16(TestCase6): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase6BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": True} + self.config_dtype() + self.index = [1, 3] + self.index_type = "int32" + + +class TestGatherBF16Op(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": index_np, + "Axis": axis_np, + } + out = gather_numpy(self.inputs["X"], index_np, axis_np[0]) + self.outputs = {"Out": out} + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def test_check_grad(self): + self.check_grad(["X"], "Out", numeric_grad_delta=0.5, check_pir=True) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + +class TestGatherNegativeAxis(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": index_np, + "Axis": axis_np, + } + out = gather_numpy(self.inputs["X"], index_np, axis_np[0]) + self.outputs = {"Out": out} + + def test_check_output(self): + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device_place()) + for place in places: + self.check_output_with_place(place) + + def test_check_grad(self): + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device_place()) + for place in places: + self.check_grad_with_place(place, ["X"], "Out", numeric_grad_delta=0.5) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (100, 3) + self.index = [0, 1, -2] + self.index_type = "int32" + self.axis = [-1] + self.axis_type = "int32" + + 
+class TestOutOfRangeError(unittest.TestCase): + def test_dygraph_forward_and_backward(self): + with dygraph_guard(): + x = paddle.randn([100, 3]).cpu() + x.stop_gradient = False + y = paddle.gather( + x, + paddle.to_tensor([0, -2]).cpu(), + axis=-1, + ) + grad_x = paddle.grad(y, x) + + def test_dygraph_error(self): + with dygraph_guard(): + # out of lower bound + with self.assertRaises(IndexError): + _ = paddle.gather( + paddle.randn([100, 3]).cpu(), + paddle.to_tensor([0, -4]).cpu(), + axis=1, + ) + # out of upper bound + with self.assertRaises(IndexError): + _ = paddle.gather( + paddle.randn([100, 3]).cpu(), + paddle.to_tensor([0, 3]).cpu(), + axis=1, + ) + + +class TestCase6Complex64(TestCase6): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase6Complex128(TestCase6): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp1(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + axis_np = np.array(self.axis).astype(self.index_type) + index_np = np.array(self.index).astype(self.index_type) + out = gather_numpy(xnp, index_np, axis_np[0]) + self.inputs = {"X": xnp, "Index": index_np, "Axis": axis_np} + self.outputs = {"Out": out} + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def test_check_grad(self): + self.check_grad(["X"], "Out", check_pir=True) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp1FP16(TestGatherOp1): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp1Complex64(TestGatherOp1): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp1Complex128(TestGatherOp1): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp2(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp2FP16(TestGatherOp2): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp2Complex64(TestGatherOp2): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp2Complex128(TestGatherOp2): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp3(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [2] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp3FP16(TestGatherOp3): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp3Complex64(TestGatherOp3): + def config_dtype(self): + self.x_type = "complex64" + + def 
test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp3Complex128(TestGatherOp3): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp4(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 100, 10) + self.config_dtype() + self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + self.attrs = {"overwrite": False} + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp4FP16(TestGatherOp4): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp4Complex64(TestGatherOp4): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp4Complex128(TestGatherOp4): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp5(TestGatherOp): + def config(self): + """ + Test for negative axis + """ + self.x_shape = (3, 100, 10) + self.config_dtype() + self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.index_type = "int64" + self.axis = [-1] + self.axis_type = "int32" + self.attrs = {"overwrite": False} + + def config_dtype(self): + self.x_type = "float64" + + def test_check_grad(self): + self.check_grad( + ["X"], + "Out", + check_pir=True, + check_prim_pir=True, + ) + + +class API_TestGather(unittest.TestCase): + def test_out1(self): + with base.program_guard(base.Program(), base.Program()): + data1 = paddle.static.data("data1", shape=[-1, 2], dtype="float64") + index = paddle.static.data("index", shape=[-1, 1], dtype="int64") + out = paddle.gather(data1, index) + place = base.CPUPlace() + exe = base.Executor(place) + input = np.array([[1, 2], [3, 4], [5, 6]]).astype("float64") + index_1 = np.array([1, 2]).astype("int64") + (result,) = exe.run( + feed={"data1": input, "index": index_1}, fetch_list=[out] + ) + expected_output = np.array([[3, 4], [5, 6]]) + np.testing.assert_allclose(result, expected_output, rtol=1e-05) + + def test_out2(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.static.data("x", shape=[-1, 2], dtype="float64") + index = paddle.static.data("index", shape=[-1, 1], dtype="int32") + axis = paddle.static.data("axis", shape=[1], dtype="int32") + out = paddle.gather(x, index, axis) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype("float64") + index_np = np.array([1, 1]).astype("int32") + axis_np = np.array([1]).astype("int32") + (result,) = exe.run( + feed={"x": x_np, "index": index_np, "axis": axis_np}, + fetch_list=[out], + ) + expected_output = gather_numpy(x_np, index_np, axis_np[0]) + np.testing.assert_allclose(result, expected_output, rtol=1e-05) + + class API_TestDygraphGather(unittest.TestCase): def test_out1(self): - paddle.set_device("metax_gpu") paddle.disable_static() - input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype("int32") + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) index_1 = np.array([1, 2]) input = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) output = paddle.gather(input, index) output_np = output.numpy() - expected_output = np.array([[3, 4], [5, 6]]).astype("int32") - np.testing.assert_allclose(output_np, expected_output) + expected_output = np.array([[3, 4], [5, 6]]) + np.testing.assert_allclose(output_np, 
expected_output, rtol=1e-05) paddle.enable_static() def test_out12(self): - paddle.set_device("metax_gpu") paddle.disable_static() - input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype("int32") + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) index_1 = np.array([1, 2]) x = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) output = paddle.gather(x, index, axis=0) output_np = output.numpy() expected_output = gather_numpy(input_1, index_1, axis=0) - np.testing.assert_allclose(output_np, expected_output) + np.testing.assert_allclose(output_np, expected_output, rtol=1e-05) paddle.enable_static() def test_zero_index(self): - paddle.set_device("metax_gpu") paddle.disable_static() - x = paddle.to_tensor([[1, 2], [3, 4]]).astype("int32") + x = paddle.to_tensor([[1, 2], [3, 4]]) index = paddle.to_tensor(np.array([]).astype("int64")) for axis in range(len(x.shape)): out = paddle.gather(x, index, axis) @@ -117,122 +779,197 @@ def test_zero_index(self): self.assertEqual(list(out.shape), expected_shape) paddle.enable_static() + def test_large_data(self): + if not paddle.is_compiled_with_cuda(): + return -class TestGathertError(unittest.TestCase): - def setUp(self) -> None: - self.place = paddle.CustomPlace("metax_gpu", 0) - paddle.set_device("metax_gpu:0") + x = np.random.rand(226862, 256).astype("float32") + index = np.random.randint(-226862, 22682, size=(8859027)) - def test_error1(self): - paddle.enable_static() - if not paddle.framework.use_pir_api(): + def test_dygraph(): + with base.dygraph.guard(): + gpu_out = paddle.gather(paddle.to_tensor(x), paddle.to_tensor(index)) + return gpu_out.numpy() + + @switch_to_static_graph + def test_static_graph(): with paddle.static.program_guard( paddle.static.Program(), paddle.static.Program() ): - - input_shape = [8, 9, 6] - index_shape = [4] - x_int8 = paddle.static.data( - shape=input_shape, dtype="int8", name="x_int8" - ) - x_float32 = paddle.static.data( - shape=input_shape, dtype="float32", name="x_float32" - ) - axis = paddle.static.data(shape=[1], dtype="float32", name="axis") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - index_float = paddle.static.data( - shape=index_shape, dtype="float32", name="index_float" + x_t = paddle.static.data(name="x", dtype=x.dtype, shape=x.shape) + index_t = paddle.static.data( + name="index", dtype=index.dtype, shape=index.shape ) + out_t = paddle.gather(x_t, index_t) + feed = {x_t.name: x, index_t.name: index} + fetch = [out_t] - def test_x_type(): - paddle.gather(x_int8, index) + gpu_exe = paddle.static.Executor(get_device_place()) + gpu_value = gpu_exe.run(feed=feed, fetch_list=fetch)[0] + return gpu_value - self.assertRaises(TypeError, test_x_type) + np.testing.assert_array_equal(test_dygraph(), test_static_graph()) - def test_index_type(): - paddle.gather(x_float32, index_float) - self.assertRaises(TypeError, test_index_type) +class TestGathertError(unittest.TestCase): + def test_error1(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int8", name="x") + axis = paddle.static.data(shape=[1], dtype="float32", name="axis") + index = paddle.static.data(shape=shape, dtype="int32", name="index") + index_float = paddle.static.data( + shape=shape, dtype="float32", name="index_float" + ) + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises((TypeError, ValueError), test_x_type) + + def test_index_type(): + paddle.gather(x, index_float) + + 
self.assertRaises((TypeError, ValueError), test_index_type) + + def test_axis_dtype(): + paddle.gather(x, index, axis=1.11) - def test_axis_dtype(): - paddle.gather(x_float32, index, axis=1.11) + self.assertRaises((TypeError, ValueError), test_axis_dtype) - self.assertRaises(TypeError, test_axis_dtype) + def test_axis_dtype1(): + paddle.gather(x, index, axis=axis) - def test_axis_dtype1(): - paddle.gather(x_float32, index, axis=axis) + self.assertRaises((TypeError, ValueError), test_axis_dtype1) - self.assertRaises(TypeError, test_axis_dtype1) - else: - paddle.set_device("metax_gpu") - input_shape = [8, 9, 6] - index_shape = [4] + def test_error2(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int8", name="x") + index = paddle.static.data(shape=shape, dtype="int32", name="mask") + index_float = paddle.static.data( + shape=shape, dtype="float32", name="index_float" + ) + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises((TypeError, ValueError), test_x_type) def test_index_type(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="float32", name="index_float" - ) - out = paddle.gather(x, index) - exe = paddle.static.Executor(place=self.place) - exe.run(paddle.static.default_startup_program()) - self.assertRaises( - ValueError, - exe.run, - paddle.static.default_main_program(), - feed={ - "x": np.random.random(input_shape).astype("float32"), - "index_float": np.random.random(index_shape).astype( - "float32" - ), - }, - ) - - def test_axis_scalar_dtype(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - axis = paddle.static.data(shape=[1], dtype="int32", name="axis") - self.assertRaises(TypeError, paddle.gather, x, index, axis=1.11) - - def test_axis_tensor_dtype(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - axis = paddle.static.data(shape=[1], dtype="float32", name="axis") - y = paddle.gather(x, index, axis=axis) - exe = paddle.static.Executor(place=self.place) - exe.run(paddle.static.default_startup_program()) - self.assertRaises( - ValueError, - exe.run, - paddle.static.default_main_program(), - feed={ - "x": np.random.random(input_shape).astype("float32"), - "index": np.random.randint(0, 8, index_shape).astype( - "int32" - ), - "axis": np.array([1.11]).astype("float32"), - }, - ) - - test_index_type() - test_axis_scalar_dtype() - # test_axis_tensor_dtype() + paddle.gather(x, index_float) + + self.assertRaises((TypeError, ValueError), test_index_type) + + def test_error3(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int32", name="x") + index = paddle.static.data(shape=shape, dtype="int32", name="index") + + def test_axis_minsize(): + paddle.gather(x, index, axis=-1) + + self.assertRaises(ValueError, test_axis_minsize) + + def test_axis_maxsize(): + paddle.gather(x, index, 
axis=512) + + self.assertRaises(ValueError, test_axis_maxsize) + + +class TestCheckOutType(unittest.TestCase): + def test_out_type(self): + data = paddle.static.data(shape=[16, 10], dtype="int64", name="x") + index = paddle.static.data(shape=[4], dtype="int64", name="index") + out = paddle.gather(data, index) + self.assertTrue(out.dtype == paddle.int64 or out.dtype == core.DataType.INT64) + + def test_pir_out_type(self): + with paddle.pir_utils.IrGuard(): + data = paddle.static.data(shape=[16, 10], dtype="int64", name="x") + index = paddle.static.data(shape=[4], dtype="int64", name="index") + out = paddle.gather(data, index) + self.assertTrue(out.dtype == core.DataType.INT64) + + +class TestGatherBackward(unittest.TestCase): + def setUp(self): + self.shape = [10, 20] + self.dtype = "float32" + self.index = (1, 3, 5) + self.index_dtype = "int64" + self.places = get_devices() + + def test_gather_backward(self): + if len(self.places) != 2: + return + res_list = [] + x_np = np.random.random(self.shape).astype(self.dtype) + index_np = np.array(self.index, dtype=self.index_dtype) + grad_out_np = np.random.random(self.shape).astype(self.dtype) + for place in self.places: + with base.dygraph.guard(place): + x = paddle.to_tensor(x_np, dtype=self.dtype) + x.stop_gradient = False + index = paddle.to_tensor(index_np, dtype=self.index_dtype) + out = paddle.gather(x, index, -1) + grad_out = paddle.to_tensor(grad_out_np, dtype=self.dtype) + (re,) = paddle.grad( + outputs=out, + inputs=x, + grad_outputs=grad_out, + ) + res_list.append(re.numpy()) + np.testing.assert_allclose(res_list[0], res_list[1]) + + +class TestGatherOp_ZeroSize(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.public_python_api = paddle.gather + self.config() + self.init_inputs_and_outputs() + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad(["X"], "Out", check_pir=True) + + def config(self): + self.x_shape = (3, 0, 4) + self.config_dtype() + self.index = [2] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + def init_inputs_and_outputs(self): + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + "X": xnp, + "Index": np.array(self.index).astype(self.index_type), + } + self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]} + + +class TestGatherOp_ZeroSize2(TestGatherOp_ZeroSize): + def config(self): + self.x_shape = (10, 20) + self.config_dtype() + self.index = [2, 0] + self.index_type = "int32" if __name__ == "__main__": + paddle.enable_static() unittest.main() From 893829371efacbff859d0eb83c7ea827f5bb0124 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:29:10 +0800 Subject: [PATCH 005/121] [Metax] update metax_gpu CMakeLists.txt (#10) * [Metax] fix dgc & mklml compile product path problem * [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test * [Metax] add mixed_vector fix & update change patch * [Metax] update metax_gpu CMakeLists.txt --- backends/metax_gpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 4567723123c..b22d7077e3b 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -26,11 +26,11 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") set(WITH_MKLML ON) 
+include(paddle) set(THIRD_PARTY_PATH "${PADDLE_SOURCE_DIR}/build/third_party" CACHE PATH "Third party libraries directory.") -include(paddle) include(version) include(generic) include(cblas) From f54187fb3e47ed8062537b9d339c48c7fd711326 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:51:43 +0800 Subject: [PATCH 006/121] [metax] updata_qr_kernel (#11) * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .../metax_kernel/qr_kernel_register.cu | 207 +++++++++--------- 1 file changed, 98 insertions(+), 109 deletions(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 7b133371f4d..745069e2eda 100644 --- a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,9 +22,8 @@ #include #include -#include "kernels/impl/values_vectors_functor.h" +#include "kernels/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/complex.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" @@ -39,7 +38,6 @@ #include "paddle/phi/kernels/slice_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/phi/kernels/tril_triu_kernel.h" - namespace phi { template @@ -358,47 +356,47 @@ void QrKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP #define FUNC_WITH_TYPES(m) m(float, s) m(double, d) -#define GEQRF_BATCH_INSTANCE(T, C) \ - template <> \ - void BatchedGeqrf(const GPUContext& dev_ctx, \ - int batch_size, \ - int m, \ - int n, \ - T* a, \ - int lda, \ - T* tau, \ - int a_stride, \ - int tau_stride) { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - for (int i = 0; i < batch_size; ++i) { \ - T* a_working_ptr = &a[i * a_stride]; \ - T* tau_working_ptr = &tau[i * tau_stride]; \ - PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##geqrf( \ - handle, m, n, a_working_ptr, lda, tau_working_ptr)); \ - } \ +#define GEQRF_BATCH_INSTANCE(T, C) \ + template <> \ + void BatchedGeqrf(const GPUContext& dev_ctx, \ + int batch_size, \ + int m, \ + int n, \ + T* a, \ + int lda, \ + T* tau, \ + int a_stride, \ + int tau_stride) { \ + auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); \ + for (int i = 0; i < batch_size; ++i) { \ + T* a_working_ptr = &a[i * a_stride]; \ + T* tau_working_ptr = &tau[i * tau_stride]; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##geqrf( \ + handle, m, n, a_working_ptr, lda, tau_working_ptr)); \ + } \ } FUNC_WITH_TYPES(GEQRF_BATCH_INSTANCE); -#define ORGQR_BATCH_INSTANCE(T, C) \ - template <> \ - void BatchedOrgqr(const GPUContext& dev_ctx, \ - int batch_size, \ - int m, \ - int n, \ - int k, \ - T* a, \ - int lda, \ - T* tau, \ - int a_stride, \ - int tau_stride) { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ 
- for (int i = 0; i < batch_size; ++i) { \ - T* a_working_ptr = &a[i * a_stride]; \ - T* tau_working_ptr = &tau[i * tau_stride]; \ - PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##orgqr( \ - handle, m, n, k, a_working_ptr, lda, tau_working_ptr)); \ - } \ +#define ORGQR_BATCH_INSTANCE(T, C) \ + template <> \ + void BatchedOrgqr(const GPUContext& dev_ctx, \ + int batch_size, \ + int m, \ + int n, \ + int k, \ + T* a, \ + int lda, \ + T* tau, \ + int a_stride, \ + int tau_stride) { \ + auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); \ + for (int i = 0; i < batch_size; ++i) { \ + T* a_working_ptr = &a[i * a_stride]; \ + T* tau_working_ptr = &tau[i * tau_stride]; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##orgqr( \ + handle, m, n, k, a_working_ptr, lda, tau_working_ptr)); \ + } \ } FUNC_WITH_TYPES(ORGQR_BATCH_INSTANCE); @@ -421,7 +419,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, const int64_t a_stride_64 = static_cast(a_stride); const int64_t tau_stride_64 = static_cast(tau_stride); - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); size_t workspace_in_bytes_on_device = 0; @@ -499,7 +496,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } else { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); @@ -555,7 +551,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cusolverDnDgeqrf_bufferSize(handle, m, n, a, lda, &lwork)); @@ -599,35 +594,33 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } template <> -void BatchedGeqrf>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + phi::complex64* a, + int lda, + phi::complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgeqrf_bufferSize( handle, m, n, reinterpret_cast(a), lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex64* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex64* a_working_ptr = &a[i * a_stride]; + phi::complex64* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgeqrf( handle, @@ -657,35 +650,33 @@ void BatchedGeqrf>( } template <> -void BatchedGeqrf>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, 
+ int n, + phi::complex128* a, + int lda, + phi::complex128* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgeqrf_bufferSize( handle, m, n, reinterpret_cast(a), lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex128* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex128* a_working_ptr = &a[i * a_stride]; + phi::complex128* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgeqrf( handle, @@ -727,7 +718,6 @@ void BatchedOrgqr(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); @@ -784,7 +774,6 @@ void BatchedOrgqr(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); @@ -829,20 +818,18 @@ void BatchedOrgqr(const GPUContext& dev_ctx, } template <> -void BatchedOrgqr>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + phi::complex64* a, + int lda, + phi::complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCungqr_bufferSize( handle, @@ -856,16 +843,16 @@ void BatchedOrgqr>( DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex64* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex64* a_working_ptr = &a[i * a_stride]; + phi::complex64* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCungqr( handle, @@ -896,20 +883,18 @@ void BatchedOrgqr>( } template <> -void BatchedOrgqr>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + phi::complex128* a, + int lda, + phi::complex128* tau, + int 
a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZungqr_bufferSize( handle, @@ -923,16 +908,16 @@ void BatchedOrgqr>( DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex128* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex128* a_working_ptr = &a[i * a_stride]; + phi::complex128* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZungqr( handle, @@ -965,11 +950,15 @@ void BatchedOrgqr>( } // namespace phi +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(qr, GPU, ALL_LAYOUT, phi::QrKernel, float, double) {} +#else PD_REGISTER_PLUGIN_KERNEL(qr, metax_gpu, ALL_LAYOUT, phi::QrKernel, float, double, - phi::dtype::complex, - phi::dtype::complex) {} + phi::complex64, + phi::complex128) {} +#endif From 1e042162a9f7cbb4c08b260bae373122fee1e827 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 15 Sep 2025 10:30:01 +0800 Subject: [PATCH 007/121] [Metax] fix illegal address access error in test_momentum_op (#12) * [Metax] fix illegal address access error in test_momentum_op --- backends/metax_gpu/patch/tmp/mixed_vector.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.h b/backends/metax_gpu/patch/tmp/mixed_vector.h index e7cf1e626c9..1dcca9c71b4 100644 --- a/backends/metax_gpu/patch/tmp/mixed_vector.h +++ b/backends/metax_gpu/patch/tmp/mixed_vector.h @@ -386,7 +386,8 @@ class MixVector { // the unify method to access CPU or CUDA data. immutable. const T *Data(phi::Place place) const { - if (place.GetType() == phi::AllocationType::GPU) { + if (place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM) { return CUDAData(place); } else { return data(); @@ -395,7 +396,8 @@ class MixVector { // the unify method to access CPU or CUDA data. mutable. 
T *MutableData(phi::Place place) { - if (place.GetType() == phi::AllocationType::GPU) { + if (place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM) { return CUDAMutableData(place); } else { return data(); From aca80a41f6f619d995f5944c584c3141fab3ce9e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 15 Sep 2025 11:41:10 +0800 Subject: [PATCH 008/121] [Metax] fix cufft and fix some blas kernel apply (#13) * [Metax] fix cufft and fix some blas kernel apply --- backends/metax_gpu/CMakeLists.txt | 13 ++---- backends/metax_gpu/patch/paddle.patch | 59 +++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index b22d7077e3b..6048b59e6c1 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -618,6 +618,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -683,15 +684,9 @@ file( ${CMAKE_SOURCE_DIR}/kernels/flash_attn_kernel.cu ${CMAKE_SOURCE_DIR}/kernels/flashattn.cc) -list( - REMOVE_ITEM - CUDA_SRCS - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/gru_compute.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/multihead_matmul_functor.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/softmax.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/weight_only_gemv.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/context_project.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/fft.cu) +list(REMOVE_ITEM CUDA_SRCS + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/softmax.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/weight_only_gemv.cu) file( GLOB diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 1935217baa0..8127caee61e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -133,6 +133,26 @@ index c0080f0a5e..458ca3e2e8 100644 } // namespace dynload } // namespace phi +diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h +index 1547909d92..66b2779392 100644 +--- a/paddle/phi/backends/dynload/cufft.h ++++ b/paddle/phi/backends/dynload/cufft.h +@@ -1,3 +1,4 @@ ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. + /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); +@@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); + cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ + }); \ + EnforceCUFFTLoaded(#__name); \ +- static void* p_##__name = dlsym(cufft_dso_handle, #__name); \ ++ std::string replaced_name = #__name; \ ++ replaced_name = replaced_name.replace(0,2,"mc"); \ ++ static void* p_##__name = dlsym(cufft_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index 59e92955c9..d2f8c2da15 100644 --- a/paddle/phi/backends/dynload/cupti.h @@ -437,6 +457,32 @@ index cb35feee32..64f5bd24ac 100644 #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" +diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu +index 88663ec880..98b93072a3 100644 +--- a/paddle/phi/kernels/funcs/gru_compute.cu ++++ b/paddle/phi/kernels/funcs/gru_compute.cu +@@ -12,7 +12,7 @@ limitations under the License. */ + #include "paddle/phi/kernels/funcs/gru_compute.h" + + #include "paddle/phi/backends/gpu/gpu_context.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" + #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" + +diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h +index 15e1a4a3c3..e4780538d7 100644 +--- a/paddle/phi/kernels/funcs/math/context_project.h ++++ b/paddle/phi/kernels/funcs/math/context_project.h +@@ -18,7 +18,7 @@ + #include + + #include "paddle/phi/core/tensor_utils.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/im2col.h" + + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -469,6 +515,19 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +index 8b0baf5f5f..260482f124 100644 +--- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu ++++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +@@ -27,7 +27,7 @@ namespace cub = hipcub; + + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" + +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/math_cuda_utils.h" + + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h From fb547db298546f2c3249e22886c2232ba4882987 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 15 Sep 2025 16:04:35 +0800 Subject: [PATCH 009/121] [metax] add warpctc_warprnn (#14) * [metax] fix bug --- backends/metax_gpu/CMakeLists.txt | 2 + backends/metax_gpu/change_patch.sh | 1 + backends/metax_gpu/cmake/warpctc.cmake | 149 ++++++ backends/metax_gpu/cmake/warprnnt.cmake | 142 ++++++ .../warpctc_grad_kernel_register.cu | 2 +- 
.../cuda_kernels/warpctc_kernel_register.cu | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 3 +- .../kernels/impl/warprnnt_kernel_impl.h | 6 +- backends/metax_gpu/patch/intrinsics.cuh | 459 ++++++++++++++++++ backends/metax_gpu/patch/paddle.patch | 26 + 10 files changed, 787 insertions(+), 5 deletions(-) create mode 100644 backends/metax_gpu/cmake/warpctc.cmake create mode 100644 backends/metax_gpu/cmake/warprnnt.cmake create mode 100644 backends/metax_gpu/patch/intrinsics.cuh diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 6048b59e6c1..cca23ab42f5 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -37,6 +37,8 @@ include(cblas) include(flashattn) include(cutlass) include(dgc) +include(warpctc) +include(warprnnt) set(PLUGIN_VERSION ${PADDLE_VERSION}) diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 833ae00f6bd..60d74ec0f3d 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -25,3 +25,4 @@ cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - +cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/ diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake new file mode 100644 index 00000000000..71c892a6cfa --- /dev/null +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -0,0 +1,149 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +endif() + +set(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) +set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed set(WARPCTC_REPOSITORY +# https://gitee.com/tianjianhe/warp-ctc.git) +set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warpctc) +set(WARPCTC_PATCH_COMMAND "") +set(WARPCTC_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && git apply + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) +endif() + +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.patch native_src) + set(WARPCTC_PATCH_COMMAND git checkout -- . 
&& git checkout ${WARPCTC_TAG} + && patch -Nd ${SOURCE_DIR} < ${native_src} &&) + set(WARPCTC_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +if(WITH_ROCM) + set(WARPCTC_PATHCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.rocm.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/devicetypes.cuh.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/hip.cmake.patch) +endif() + +set(WARPCTC_INCLUDE_DIR + "${WARPCTC_INSTALL_DIR}/include" + CACHE PATH "Warp-ctc Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPCTC_LIB_DIR + "${WARPCTC_INSTALL_DIR}/lib" + CACHE PATH "Warp-ctc Library Directory" FORCE) + +if(WIN32) + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else() + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG $) + set(WARPCTC_C_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS_DEBUG + $) +else() + set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() + +ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPCTC_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_COMMAND} + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${WARPCTC_PATHCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPCTC_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES}) + +message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) +include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its + # headers. 
+ +add_library(warpctc INTERFACE) +add_dependencies(warpctc extern_warpctc) diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake new file mode 100644 index 00000000000..54a7ad6be86 --- /dev/null +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -0,0 +1,142 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPRNNT_WITH_HIP) +endif() + +set(WARPRNNT_PREFIX_DIR ${THIRD_PARTY_PATH}/warprnnt) +set(WARPRNNT_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warprnnt) +set(WARPRNNT_TAG 7ea6bfe748779c245a0fcaa5dd9383826273eff2) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warprnnt) +set(WARPRNNT_PATCH_COMMAND "") +set(WARPRNNT_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + ${CMAKE_COMMAND} -E copy_if_different + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch + "/") +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch) +endif() +if(WITH_ROCM) + set(WARPRNNT_PATCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.rocm.patch) +endif() +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.patch native_src) + set(WARPRNNT_PATCH_COMMAND + git checkout -- . 
&& git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < ${native_src}) + set(WARPRNNT_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +set(WARPRNNT_INCLUDE_DIR + "${WARPRNNT_INSTALL_DIR}/include" + CACHE PATH "Warp-rnnt Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPRNNT_LIB_DIR + "${WARPRNNT_INSTALL_DIR}/lib" + CACHE PATH "Warp-rnnt Library Directory" FORCE) + +if(WIN32) + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +else() + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPRNNT_C_FLAGS $) + set(WARPRNNT_C_FLAGS_DEBUG + $) + set(WARPRNNT_C_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS $) + set(WARPRNNT_CXX_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS_DEBUG + $) +else() + set(WARPRNNT_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPRNNT_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPRNNT_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPRNNT_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() +ExternalProject_Add( + extern_warprnnt + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPRNNT_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${WARPRNNT_PATCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPRNNT_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPRNNT_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPRNNT_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPRNNT_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPRNNT_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPRNNT_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPRNNT_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPCTC_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPRNNT_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPRNNT_LIBRARIES}) + +message(STATUS "warp-rnnt library: ${WARPRNNT_LIBRARIES}") +get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) +include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its + # headers. 
+ +add_library(warprnnt INTERFACE) +# set_property(TARGET warprnnt PROPERTY IMPORTED_LOCATION ${WARPRNNT_LIBRARIES}) +add_dependencies(warprnnt extern_warprnnt) diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu index e77a29d12fe..d02f805a671 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu @@ -17,7 +17,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_grad_kernel.h" -PD_REGISTER_PLUGIN_KERNEL(warpctc_grad, +PD_CUSTOM_KERNEL_REGISTER(warpctc_grad, metax_gpu, ALL_LAYOUT, phi::WarpctcGradKernel, diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu index 5b343506cad..c488e23fba9 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_kernel.h" -PD_REGISTER_PLUGIN_KERNEL( +PD_CUSTOM_KERNEL_REGISTER( warpctc, metax_gpu, ALL_LAYOUT, phi::WarpctcKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index eb64f21c90f..9794ba1b3c0 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -204,7 +204,8 @@ class WarpCTCFunctor { void init(const Context& dev_ctx, const size_t blank) { warpctc_version_ = phi::dynload::get_warpctc_version(); - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = CTC_GPU; options_.stream = diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index 96e756b16b1..bb4311f5912 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -138,7 +138,8 @@ class WarpRNNTFunctor { // There is no memory allocated operations within warp-rnnt. rnntStatus_t status = RNNT_STATUS_UNKNOWN_ERROR; bool gpu = false; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu = true; #else @@ -207,7 +208,8 @@ class WarpRNNTFunctor { options_.fastemit_lambda = fastemit_lambda; options_.batch_first = true; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = RNNT_GPU; options_.stream = diff --git a/backends/metax_gpu/patch/intrinsics.cuh b/backends/metax_gpu/patch/intrinsics.cuh new file mode 100644 index 00000000000..71365b6577c --- /dev/null +++ b/backends/metax_gpu/patch/intrinsics.cuh @@ -0,0 +1,459 @@ +/****************************************************************************** + * Copyright (c) 2013, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * + * Code and text by Sean Baxter, NVIDIA Research + * See http://nvlabs.github.io/moderngpu for repository and documentation. 
+ *
+ ******************************************************************************/
+
+#include "devicetypes.cuh"
+
+#pragma once
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+
+namespace mgpu {
+
+MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
+  return *reinterpret_cast<uint2*>(&x);
+}
+MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
+  return *reinterpret_cast<uint64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
+  return *reinterpret_cast<int64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 double_as_int2(double x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE double int2_as_double(int2 x) {
+  return *reinterpret_cast<double*>(&x);
+}
+
+MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
+  reinterpret_cast<int*>(&d)[0] = x;
+}
+MGPU_HOST_DEVICE int GetDoubleX(double d) {
+  return double_as_int2(d).x;
+}
+MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
+  reinterpret_cast<int*>(&d)[1] = y;
+}
+MGPU_HOST_DEVICE int GetDoubleY(double d) {
+  return double_as_int2(d).y;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PTX for bfe and bfi
+
+#if __CUDA_ARCH__ >= 200
+
+MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
+  uint result;
+  asm("bfe.u32 %0, %1, %2, %3;" :
+      "=r"(result) : "r"(x), "r"(bit), "r"(numBits));
+  return result;
+}
+
+
+MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
+  uint result;
+  asm("bfi.b32 %0, %1, %2, %3, %4;" :
+      "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
+  return result;
+}
+
+MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
+  uint ret;
+  asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
+  return ret;
+}
+
+#endif // __CUDA_ARCH__ >= 200
+
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_up
+
+__device__ __forceinline__ float shfl_up(float var,
+  unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
+#else
+  var = __shfl_up(var, delta, width);
+#endif
+#endif
+  return var;
+}
+
+__device__ __forceinline__ double shfl_up(double var,
+  unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+  int2 p = mgpu::double_as_int2(var);
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
+  p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
+#else
+  p.x = __shfl_up(p.x, delta, width);
+  p.y = __shfl_up(p.y, delta, width);
+#endif
+  var = mgpu::int2_as_double(p);
+#endif
+
+  return var;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_add
+
+// MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
+//   int result = 0;
+// #if __CUDA_ARCH__ >= 300
+//   int mask = (WARP_SIZE - width)<< 8;
+// #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+//   asm(
+//       "{.reg .s32 r0;"
+//       ".reg .pred p;"
+//       "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;"
+//       "@p add.s32 r0, r0, %4;"
+//       "mov.s32 %0, r0; }"
+//       : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+// #else
+//   asm(
+//       "{.reg .s32 r0;"
+//       ".reg .pred p;"
+//       "shfl.up.b32 r0|p, %1, %2, %3;"
+//       "@p add.s32 r0, r0, %4;"
+//       "mov.s32 %0, r0; }"
+//       : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+// #endif
+// #endif
+//   return result;
+// }
+
+MGPU_DEVICE int
shfl_add(int x, int offset, int width = 32) +{ +#if __CUDA_ARCH__ >= 300 + unsigned fullMask = 0xffffffffU; + unsigned mask = (width == 32) ? fullMask : ((1U << width) - 1U); + int src = 0; +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9 + src = __shfl_up_sync(mask, x, offset, width); // CUDA 9+ +#else + src = __shfl_up(x, offset, width); // CUDA 8- +#endif + int lane = threadIdx.x & 31; + return (lane >= offset) ? (src + x) : x; +#else + return x; +#endif +} + +MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { + int result = 0; +#if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.b32 r0|p, %1, %2, %3;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#endif +#endif + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// brev, popc, clz, bfe, bfi, prmt + +// Reverse the bits in an integer. +MGPU_HOST_DEVICE uint brev(uint x) { +#if __CUDA_ARCH__ >= 200 + uint y = __brev(x); +#else + uint y = 0; + for(int i = 0; i < 32; ++i) + y |= (1 & (x>> i))<< (31 - i); +#endif + return y; +} + +// Count number of bits in a register. +MGPU_HOST_DEVICE int popc(uint x) { +#if __CUDA_ARCH__ >= 200 + return __popc(x); +#else + int c; + for(c = 0; x; ++c) + x &= x - 1; + return c; +#endif +} + +// Count leading zeros - start from most significant bit. +MGPU_HOST_DEVICE int clz(int x) { +#if __CUDA_ARCH__ >= 200 + return __clz(x); +#else + for(int i = 31; i >= 0; --i) + if((1<< i) & x) return 31 - i; + return 32; +#endif +} + +// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. +MGPU_HOST_DEVICE int ffs(int x) { +#if __CUDA_ARCH__ >= 200 + return __ffs(x); +#else + for(int i = 0; i < 32; ++i) + if((1<< i) & x) return i + 1; + return 0; +#endif +} + +MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) { +#if __CUDA_ARCH__ >= 200 + return bfe_ptx(x, bit, numBits); +#else + return ((1<< numBits) - 1) & (x>> bit); +#endif +} + +MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = bfi_ptx(x, y, bit, numBits); +#else + if(bit + numBits > 32) numBits = 32 - bit; + uint mask = ((1<< numBits) - 1)<< bit; + result = y & ~mask; + result |= mask & (x<< bit); +#endif + return result; +} + +MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = prmt_ptx(a, b, index); +#else + result = 0; + for(int i = 0; i < 4; ++i) { + uint sel = 0xf & (index>> (4 * i)); + uint x = ((7 & sel) > 3) ? b : a; + x = 0xff & (x>> (8 * (3 & sel))); + if(8 & sel) x = (128 & x) ? 0xff : 0; + result |= x<< (8 * i); + } +#endif + return result; +} + +// Find log2(x) and optionally round up to the next integer logarithm. +MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) { + int a = 31 - clz(x); + if(roundUp) a += !MGPU_IS_POW_2(x); + return a; +} + +//////////////////////////////////////////////////////////////////////////////// +// vset4 + +#if __CUDA_ARCH__ >= 300 + +// Performs four byte-wise comparisons and returns 1 for each byte that +// satisfies the conditional, and zero otherwise. 
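+//
+// For example (illustrative values, not part of the original moderngpu
+// source): with a = 0x01020304, b = 0x02020304 and c = 0, only the most
+// significant byte lane satisfies "less than" (0x01 < 0x02), so
+// vset4_lt_add(a, b, 0) returns 0x01000000, while vset4_eq(a, b) returns
+// 0x00010101 for the three equal lanes.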
+MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
+  uint result;
+  asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(c));
+  return result;
+}
+MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
+  uint result;
+  asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(0));
+  return result;
+}
+#endif // __CUDA_ARCH__ >= 300
+
+MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_lt_add_ptx(a, b, c);
+#else
+  result = c;
+  if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
+  if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_eq_ptx(a, b);
+#else
+  result = 0;
+  if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
+  if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+
+MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
+#if __CUDA_ARCH__ >= 100
+  return __umulhi(x, y);
+#else
+  uint64 product = (uint64)x * y;
+  return (uint)(product>> 32);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ldg() function defined for all devices and all types. Only compiles to __ldg
+// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
+// by __ldg in sm_32_intrinsics.h
+
+template<typename T>
+struct IsLdgType {
+  enum { value = false };
+};
+#define DEFINE_LDG_TYPE(T) \
+  template<> struct IsLdgType<T> { enum { value = true }; };
+
+template<typename T, bool UseLdg = IsLdgType<T>::value>
+struct LdgShim {
+  MGPU_DEVICE static T Ldg(const T* p) {
+    return *p;
+  }
+};
+
+#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
+
+  // List of __ldg-compatible types from sm_32_intrinsics.h.
+  DEFINE_LDG_TYPE(char)
+  DEFINE_LDG_TYPE(short)
+  DEFINE_LDG_TYPE(int)
+  DEFINE_LDG_TYPE(long long)
+  DEFINE_LDG_TYPE(char2)
+  DEFINE_LDG_TYPE(char4)
+  DEFINE_LDG_TYPE(short2)
+  DEFINE_LDG_TYPE(short4)
+  DEFINE_LDG_TYPE(int2)
+  DEFINE_LDG_TYPE(int4)
+  DEFINE_LDG_TYPE(longlong2)
+
+  DEFINE_LDG_TYPE(unsigned char)
+  DEFINE_LDG_TYPE(unsigned short)
+  DEFINE_LDG_TYPE(unsigned int)
+  DEFINE_LDG_TYPE(unsigned long long)
+  DEFINE_LDG_TYPE(uchar2)
+  DEFINE_LDG_TYPE(uchar4)
+  DEFINE_LDG_TYPE(ushort2)
+  DEFINE_LDG_TYPE(ushort4)
+  DEFINE_LDG_TYPE(uint2)
+  DEFINE_LDG_TYPE(uint4)
+  DEFINE_LDG_TYPE(ulonglong2)
+
+  DEFINE_LDG_TYPE(float)
+  DEFINE_LDG_TYPE(double)
+  DEFINE_LDG_TYPE(float2)
+  DEFINE_LDG_TYPE(float4)
+  DEFINE_LDG_TYPE(double2)
+
+  template<typename T> struct LdgShim<T, true> {
+    MGPU_DEVICE static T Ldg(const T* p) {
+      return __ldg(p);
+    }
+  };
+#endif
+
+template<typename T>
+MGPU_DEVICE T ldg(const T* p) {
+  return LdgShim<T>::Ldg(p);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Fast division for 31-bit integers.
+// Uses the method in Hacker's Delight (2nd edition) page 228.
+// Evaluates for denom > 1 and x < 2^31.
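+//
+// Worked example (illustration only): for denom = 10, FindLog2(10, true) = 4,
+// so p = 35, coef = ceil(2^35 / 10) = 3435973837 and shift = 3. Divide(100)
+// then computes umulhi(100, 3435973837) >> 3 = 80 >> 3 = 10, and
+// Modulus(100) = 100 - 10 * 10 = 0.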
+struct FastDivide { + uint denom; + uint coef; + uint shift; + + MGPU_HOST_DEVICE uint Divide(uint x) { + return umulhi(x, coef)>> shift; + } + MGPU_HOST_DEVICE uint Modulus(uint x) { + return x - Divide(x) * denom; + } + + explicit FastDivide(uint denom_) { + denom = denom_; + uint p = 31 + FindLog2(denom, true); + coef = (uint)(((1ull<< p) + denom - 1) / denom); + shift = p - 32; + } +}; + +#pragma GCC diagnostic pop + +} // namespace mgpu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 8127caee61e..0283a443adb 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1087,6 +1087,32 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h +index 7b85903776..3f4b298807 100644 +--- a/paddle/phi/kernels/impl/merged_momentum_impl.h ++++ b/paddle/phi/kernels/impl/merged_momentum_impl.h +@@ -297,7 +297,7 @@ void MergedMomentumInnerCompute( + params_out[idx], + velocities_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + phi::funcs::ForRange for_range( + static_cast(dev_ctx), params[idx]->numel()); + const auto grad_type = grads[idx]->dtype(); +diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h +index de5bcfc30b..eb2a9714f5 100644 +--- a/paddle/phi/kernels/impl/momentum_kernel_impl.h ++++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h +@@ -457,7 +457,7 @@ void MomentumDenseImpl(const Context& dev_ctx, + regularization_coeff, + param_out, + velocity_out); +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + funcs::ForRange for_range(dev_ctx, param.numel()); + const auto grad_type = grad.dtype(); + #define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h From 8e981985c3b9f2e6bfc3789d92b48fed42abace1 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:40:04 +0800 Subject: [PATCH 010/121] [Metax] update metax CI (#15) * [Metax] update metax CI --- backends/metax_gpu/tests/CMakeLists.txt | 100 ++++- .../check_diff_metax_legacy_unit_test.sh | 108 +++++ .../tests/unit_test/test_abs_metax.py | 39 ++ .../tests/unit_test/test_arange_metax.py | 260 ++++++++++++ .../test_bfloat16_embedding_metax.py | 72 ++++ .../unit_test/test_count_nonzero_api_metax.py | 81 ++++ .../unit_test/test_gaussian_nll_loss_metax.py | 208 +++++++++ .../tests/unit_test/test_greater_equal.py | 44 ++ ...bate_build_src_rank_and_local_expert_id.py | 62 +++ ...test_incubate_expand_modality_expert_id.py | 172 ++++++++ .../test_incubate_fused_rmsnorm_ext_metax.py | 95 +++++ .../unit_test/test_incubate_moe_combine.py | 193 +++++++++ ...moe_gate_dispatch_partial_nosoftmaxtopk.py | 218 ++++++++++ 
...st_incubate_moe_gate_dispatch_w_permute.py | 207 +++++++++ ...ncubate_moe_gate_dispatch_w_permute_bwd.py | 175 ++++++++ .../tests/unit_test/test_layer_norm.py | 358 ++++++++++++++++ .../tests/unit_test/test_matmul_op__metax.py | 395 ++++++++++++++++++ .../tests/unit_test/test_nonzero_api_metax.py | 220 ++++++++++ .../tests/unit_test/test_p_norm_op_metax.py | 215 ++++++++++ .../tests/unit_test/test_squeeze_op_metax.py | 125 ++++++ .../tests/unit_test/test_swiglu_metax.py | 295 +++++++++++++ .../tests/unit_test/test_top_p_sampling.py | 162 +++++++ .../unit_test/test_unsqueeze_op_metax.py | 98 +++++ 23 files changed, 3894 insertions(+), 8 deletions(-) create mode 100644 backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh create mode 100644 backends/metax_gpu/tests/unit_test/test_abs_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_arange_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_greater_equal.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py create mode 100644 backends/metax_gpu/tests/unit_test/test_layer_norm.py create mode 100644 backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_swiglu_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_top_p_sampling.py create mode 100644 backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index d2e92f209ab..7e549ef4eaa 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -5,22 +5,106 @@ enable_testing() find_package(Python REQUIRED COMPONENTS Interpreter) -file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "unittest/*.py") +set(PADDLE_LEGACY_TEST_PATH + ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) +set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) + +file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") list( APPEND PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test/test_tril_triu_op.py -) + ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_reduce_op.py) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_cumsum_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_softmax_with_cross_entropy_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_expand_v2_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_tril_triu_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_squared_l2_norm_op_metax.py) + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py 
+  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py)
 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)
 foreach(test_script ${PYTHON_TEST_SCRIPTS})
diff --git a/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh
new file mode 100644
index 00000000000..86bfcb08f86
--- /dev/null
+++ b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+SOURCE_DIR="backends/metax_gpu/tests/unittest"
+SEARCH_DIR="Paddle/test/legacy_test"
+PREFIX_FILE="metax_prefixes.txt"
+UNMATCHED_FILE="unmatched_files.txt"
+EXIST_FILE="existing_files.txt"
+MISS_FILE="missing_files.txt"
+
+# Check that the source path exists
+if [ ! -d "$SOURCE_DIR" ]; then
+    echo "Error: source path '$SOURCE_DIR' does not exist or is not a directory"
+    exit 1
+fi
+
+# Check that the search path exists
+if [ ! -d "$SEARCH_DIR" ]; then
+    echo "Error: search path '$SEARCH_DIR' does not exist or is not a directory"
+    exit 1
+fi
+
+# Step 1: extract prefixes (according to the _op/_metax rules)
+echo "Step 1: extracting file prefixes from '$SOURCE_DIR' (by the _op/_metax rules)..."
+> "$PREFIX_FILE"     # clear the prefix file
+> "$UNMATCHED_FILE"  # clear the unmatched-file list
+
+find "$SOURCE_DIR" -type f -name "*.py" | while read -r file; do
+    filename=$(basename "$file")
+    prefix=""
+
+    # Rule 1: if the name contains "_op", take everything before "_op"
+    if [[ "$filename" == *"_op"* ]]; then
+        prefix="${filename%%_op*}"
+        echo "Extracted prefix (_op rule): $prefix (from $filename)"
+        echo "$prefix" >> "$PREFIX_FILE"
+
+    # Rule 2: no "_op" but the name contains "_metax", take everything before "_metax"
+    elif [[ "$filename" == *"_metax"* ]]; then
+        prefix="${filename%%_metax*}"
+        echo "Extracted prefix (_metax rule): $prefix (from $filename)"
+        echo "$prefix" >> "$PREFIX_FILE"
+
+    # Rule 3: neither keyword, record as unmatched
+    else
+        echo "Unmatched file: $filename (contains neither _op nor _metax)"
+        echo "$filename" >> "$UNMATCHED_FILE"
+    fi
+done
+
+# Check whether any prefixes or unmatched files were collected
+prefix_count=$(wc -l < "$PREFIX_FILE")
+unmatched_count=$(wc -l < "$UNMATCHED_FILE")
+
+echo "Extraction finished - valid prefixes: $prefix_count, unmatched files: $unmatched_count"
+
+if [ $prefix_count -eq 0 ] && [ $unmatched_count -eq 0 ]; then
+    echo "Warning: no files ending with '_metax.py' were found in '$SOURCE_DIR'"
+    exit 0
+fi
+
+# Step 2: look for files with matching names in the search path (current directory only, no subdirectories)
+echo -e "\nStep 2: searching '$SEARCH_DIR' for matching files (depth 1)..."
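+# Example of the mapping applied here (filenames taken from this patch):
+#   test_squeeze_op_metax.py matches rule 1, yields the prefix "test_squeeze"
+#   and is looked up as test_squeeze_op.py; test_abs_metax.py has no "_op",
+#   so rule 2 yields "test_abs" and this step looks for test_abs_op.py.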
+> "$EXIST_FILE"  # clear the existing-files list
+> "$MISS_FILE"   # clear the missing-files list
+
+# Process each prefix in turn
+while read -r prefix; do
+    # Skip empty lines
+    if [ -z "$prefix" ]; then
+        continue
+    fi
+
+    # Search only directly under the search path (depth 1)
+    found=$(find "$SEARCH_DIR" -maxdepth 1 -type f -name "${prefix}_op.py" -print -quit)
+
+    if [ -n "$found" ]; then
+        echo "$prefix -> found file: $found"
+        echo "${prefix}_op.py" >> "$EXIST_FILE"
+    else
+        echo "$prefix -> no matching file found"
+        echo "$prefix" >> "$MISS_FILE"
+    fi
+done < "$PREFIX_FILE"
+
+# Print summary statistics
+exist_count=$(wc -l < "$EXIST_FILE")
+miss_count=$(wc -l < "$MISS_FILE")
+
+echo -e "\nProcessing finished!"
+echo "Prefixes with a matching file: $exist_count (saved to $EXIST_FILE)"
+echo "Prefixes without a matching file: $miss_count (saved to $MISS_FILE)"
+echo "Files matching no rule: $unmatched_count (saved to $UNMATCHED_FILE)"
diff --git a/backends/metax_gpu/tests/unit_test/test_abs_metax.py b/backends/metax_gpu/tests/unit_test/test_abs_metax.py
new file mode 100644
index 00000000000..0dae6822bba
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_abs_metax.py
@@ -0,0 +1,39 @@
+# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
+# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.base.dygraph as dg
+
+
+class TestAbs(unittest.TestCase):
+    def setUp(self):
+        self._dtypes = ["float32"]
+        self._places = [paddle.CustomPlace("metax_gpu", 0)]
+
+    def test_all_positive(self):
+        for dtype in self._dtypes:
+            x = 1 + 10 * np.random.random([13, 3, 3]).astype(dtype)
+            for place in self._places:
+                with dg.guard(place):
+                    y = paddle.abs(paddle.to_tensor(x))
+                    np.testing.assert_allclose(np.abs(x), y.numpy(), rtol=1e-05)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_arange_metax.py b/backends/metax_gpu/tests/unit_test/test_arange_metax.py
new file mode 100644
index 00000000000..89308c33401
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_arange_metax.py
@@ -0,0 +1,260 @@
+# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
+# # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import unittest + +import numpy as np +from op_test import OpTest, convert_float_to_uint16 + +import paddle +from paddle.base import core +from paddle.static import Program, program_guard + + +def arange_wrapper(start, end, step, dtype="float32"): + return paddle.arange(start, end, step, dtype) + + +class TestArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": np.array([self.case[0]]).astype(self.dtype), + "End": np.array([self.case[1]]).astype(self.dtype), + "Step": np.array([self.case[2]]).astype(self.dtype), + } + + self.outputs = { + "Out": np.arange(self.case[0], self.case[1], self.case[2]).astype( + self.dtype + ) + } + + def init_config(self): + self.dtype = np.float32 + self.python_api = arange_wrapper + self.case = (0, 1, 0.2) + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + +class TestFloatArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float32 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +class TestFloat16ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float16 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestBFloat16ArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": convert_float_to_uint16(self.start), + "End": convert_float_to_uint16(self.end), + "Step": convert_float_to_uint16(self.step), + } + + self.outputs = { + "Out": convert_float_to_uint16(np.arange(self.start, self.end, self.step)) + } + + def init_config(self): + self.dtype = np.uint16 + self.python_api = arange_wrapper + self.case = (0, 5, 1) + self.start = np.array([self.case[0]]).astype(np.float32) + self.end = np.array([self.case[1]]).astype(np.float32) + self.step = np.array([self.case[2]]).astype(np.float32) + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, check_pir=True, check_symbol_infer=False) + + +class TestInt32ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 5, 2) + + +class TestFloat64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float64 + self.python_api = paddle.arange + self.case = (10, 1, -2) + + +class TestInt64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int64 + self.python_api = paddle.arange + self.case = (-1, -10, -2) + + +class TestZeroSizeArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 0, 1) + + +class TestArangeOpError(unittest.TestCase): + def test_static_errors(self): + with program_guard(Program(), Program()): + paddle.enable_static() + self.assertRaises(TypeError, paddle.arange, 10, dtype="int8") + + +class TestArangeAPI(unittest.TestCase): + def test_out(self): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x1 = paddle.arange(0, 5, 1, "float32") + + place = ( + paddle.CUDAPlace(0) + if core.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + out = exe.run(fetch_list=[x1]) + + expected_data = np.arange(0, 5, 1).astype(np.float32) + self.assertEqual((out == expected_data).all(), True) + 
self.assertListEqual(list(x1.shape), [5]) + paddle.disable_static(place) + + +class TestArangeImperative(unittest.TestCase): + def test_out(self): + place = ( + paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace() + ) + paddle.disable_static(place) + x1 = paddle.arange(0, 5, 1) + x2 = paddle.tensor.arange(5) + x3 = paddle.tensor.creation.arange(5) + + start = paddle.to_tensor(np.array([0], "float32")) + end = paddle.to_tensor(np.array([5], "float32")) + step = paddle.to_tensor(np.array([1], "float32")) + x4 = paddle.arange(start, end, step, "int64") + + expected_data = np.arange(0, 5, 1).astype(np.int64) + for x in [x1, x2, x3, x4]: + np.testing.assert_array_equal(x.numpy(), expected_data) + + start_float = paddle.to_tensor(np.array([0.5], "float32")) + end_float = paddle.to_tensor(np.array([1.5], "float32")) + step_float = paddle.to_tensor(np.array([0.5], "float32")) + # all [start, end, step] is float + x5 = paddle.arange(start_float, end_float, step_float) + x5_expected_data = np.arange(0.5, 1.5, 0.5).astype(np.float32) + np.testing.assert_array_equal(x5.numpy(), x5_expected_data) + self.assertEqual(x5.numpy().dtype, np.float32) + + # [start, end] is float , [step] is int + x6 = paddle.arange(start_float, end_float, 1) + x6_expected_data = np.arange(0.5, 1.5, 1).astype(np.float32) + np.testing.assert_array_equal(x6.numpy(), x6_expected_data) + self.assertEqual(x6.numpy().dtype, np.float32) + + # [start] is float , [end] is int + x7 = paddle.arange(start_float, 1) + x7_expected_data = np.arange(0.5, 1).astype(np.float32) + np.testing.assert_array_equal(x7.numpy(), x7_expected_data) + self.assertEqual(x7.numpy().dtype, np.float32) + + # [start] is float + x8 = paddle.arange(start_float) + x8_expected_data = np.arange(0.5).astype(np.float32) + np.testing.assert_array_equal(x8.numpy(), x8_expected_data) + self.assertEqual(x8.numpy().dtype, np.float32) + + # [start] is int + x9 = paddle.arange(1) + x9_expected_data = np.arange(1).astype(np.int64) + np.testing.assert_array_equal(x9.numpy(), x9_expected_data) + self.assertEqual(x9.numpy().dtype, np.int64) + + # [start] is float + x10 = paddle.arange(1.0) + x10_expected_data = np.arange(1).astype(np.float32) + np.testing.assert_array_equal(x10.numpy(), x10_expected_data) + self.assertEqual(x10.numpy().dtype, np.float32) + + # [start] is np.int + x11 = paddle.arange(np.int64(10)) + x11_expected_data = np.arange(10).astype(np.int64) + np.testing.assert_array_equal(x11.numpy(), x11_expected_data) + self.assertEqual(x11.numpy().dtype, np.int64) + + # [start] is a big integer + x12 = paddle.arange( + start=0, + end=-9007199254740994, + step=-9007199254740993, + ) + + # numpy give wrong result here, so we generate 'x12_expected_data' manually + # x12_expected_data = np.arange(start=0, stop=-9007199254740994, step=-9007199254740993, dtype=np.int64) + x12_expected_data = np.array([0, -9007199254740993]) + + np.testing.assert_array_equal(x12.numpy(), x12_expected_data) + self.assertEqual(x12.numpy().dtype, np.int64) + + # [startend step>0] + x14 = paddle.arange(start=10, end=0, step=1) + + x14_expected_data = np.array([]) + np.testing.assert_array_equal(x14.numpy(), x14_expected_data) + + paddle.enable_static() + + +class TestArangeStatic(unittest.TestCase): + def test_infermeta(self): + paddle.enable_static() + x = paddle.arange(0, 1 + 0.005, 0.005) + self.assertEqual(x.shape, [201]) + paddle.disable_static() + + +if __name__ == "__main__": + unittest.main() diff --git 
a/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py new file mode 100644 index 00000000000..f575d4eece0 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py @@ -0,0 +1,72 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F + + +class BF16EmbeddingTest(unittest.TestCase): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 1024 + self.hidden_size = 512 + self.seed = 10 + + def run_main(self, dtype): + ids, weight, dout = self.gen_random() + origin_dtype = weight.dtype + weight_cast = weight.astype(dtype) + out = F.embedding(ids, weight_cast) + dout = dout.astype(out.dtype) + dweight = paddle.autograd.grad(out, weight, dout) + return ( + out.astype(origin_dtype).numpy(), + dweight[0].astype(origin_dtype).numpy(), + ) + + def gen_random(self): + np.random.seed(self.seed) + weight = np.random.random([self.vocab_size, self.hidden_size]).astype("float32") + ids = np.random.randint(low=0, high=self.vocab_size, size=[self.batch_size]) + dout = np.random.random([self.batch_size, self.hidden_size]).astype("float32") + + weight = paddle.to_tensor(weight) + weight.stop_gradient = False + ids = paddle.to_tensor(ids) + dout = paddle.to_tensor(dout) + return ids, weight, dout + + def test_main(self): + + ret1 = self.run_main("float32") + ret2 = self.run_main("bfloat16") + self.assertEqual(len(ret1), len(ret2)) + for i, (r1, r2) in enumerate(zip(ret1, ret2)): + np.testing.assert_allclose(r1, r2, atol=1e-3, rtol=1e-2) + + +class BF16EmbeddingTestOddHiddenSize(BF16EmbeddingTest): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 511 + self.hidden_size = 512 + self.seed = 20 + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py new file mode 100644 index 00000000000..57a5d0b1c97 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle + +np.random.seed(10) + + +class TestCountNonzeroAPI(unittest.TestCase): + # test paddle.tensor.math.count_nonzero + + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data("X", self.x_shape) + out1 = paddle.count_nonzero(x) + out2 = paddle.tensor.count_nonzero(x) + out3 = paddle.tensor.math.count_nonzero(x) + axis = np.arange(len(self.x_shape)).tolist() + out4 = paddle.count_nonzero(x, axis) + out5 = paddle.count_nonzero(x, tuple(axis)) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={"X": self.x}, fetch_list=[out1, out2, out3, out4, out5]) + out_ref = np.count_nonzero(self.x) + for out in res: + np.testing.assert_allclose(out, out_ref, rtol=1e-05) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def test_case(x, axis=None, keepdim=False): + x_tensor = paddle.to_tensor(x) + out = paddle.count_nonzero(x_tensor, axis=axis, keepdim=keepdim) + if isinstance(axis, list): + axis = tuple(axis) + if len(axis) == 0: + axis = None + + out_ref = np.count_nonzero(x, axis, keepdims=keepdim) + np.testing.assert_allclose(out.numpy(), out_ref, rtol=1e-05) + + test_case(self.x) + test_case(self.x, None) + test_case(self.x, -1) + test_case(self.x, keepdim=True) + test_case(self.x, 2, keepdim=True) + test_case(self.x, [0, 2]) + test_case(self.x, (0, 2)) + test_case(self.x, (0, 1, 3)) + test_case(self.x, [0, 1, 2, 3]) + paddle.enable_static() + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data("X", [10, 12], "int32") + self.assertRaises(ValueError, paddle.count_nonzero, x, axis=10) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py new file mode 100644 index 00000000000..73e389324f9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py @@ -0,0 +1,208 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.base import core + +np.random.seed(10) + + +def ref_gaussian_nll_loss( + input, label, variance, full=False, eps=1e-6, reduction="none" +): + if variance.shape != input.shape: + if input.shape[:-1] == variance.shape: + variance = np.expand_dims(variance, -1) + elif input.shape[:-1] == variance.shape[:-1] and variance.shape[-1] == 1: + pass + else: + raise ValueError("variance is of incorrect size") + if reduction != "none" and reduction != "mean" and reduction != "sum": + raise ValueError(reduction + " is not valid") + + if np.any(variance < 0): + raise ValueError("var has negative entry/entries") + + variance = variance.copy() + variance = np.clip(variance, a_min=eps, a_max=None) + + loss = 0.5 * (np.log(variance) + (input - label) ** 2 / variance) + if full: + loss += 0.5 * np.log(2 * np.pi) + + if reduction == "none": + return loss + elif reduction == "sum": + return [np.sum(loss)] + elif reduction == "mean": + return [np.mean(loss)] + + +class TestGaussianNLLLossAPI(unittest.TestCase): + # test paddle.nn.functional.gaussian_nll_loss, paddle.nn.gaussian_nll_loss + + def setUp(self, type=None): + self.shape = [10, 2] + if type in ["float16", "float64", "int32", "int64"]: + dtype = np.dtype(type) + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + elif type == "broadcast1": + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + elif type == "broadcast2": + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2, 1] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + else: + dtype = np.dtype("float32") + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + if type == "test_err": + self.variance_np = -np.ones(self.shape).astype(np.float32) + + self.place = ( + paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace() + ) + + def test_dynamic_case(self, type=None, full=False, reduction="none"): + self.setUp(type) + paddle.disable_static(self.place) + + input_x = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) + variance = paddle.to_tensor(self.variance_np) + if type in ["test_err", "int32", "int64"]: + self.assertRaises( + ValueError, + paddle.nn.functional.gaussian_nll_loss, + input=input_x, + label=label, + variance=variance, + ) + else: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss(full, reduction=reduction) + out2 = gaussian_nll_loss(input_x, label, variance) + + for r in [out1, out2]: + np.allclose(out_ref, r.numpy(), rtol=1e-5, atol=1e-5) + paddle.enable_static() + + def test_static_case(self, type=None, full=False, reduction="none"): + self.setUp(type) + paddle.enable_static() + with 
paddle.static.program_guard(paddle.static.Program()): + if type in ["int32", "int64", "float64"]: + input_x = paddle.static.data("Input_x", self.shape, type) + label = paddle.static.data("Label", self.shape, type) + variance = paddle.static.data("Variance", self.shape, type) + elif type in ["broadcast1", "broadcast2"]: + input_x = paddle.static.data("Input_x", self.shape) + label = paddle.static.data("Label", self.shape) + variance = paddle.static.data("Variance", self.broadcast_shape) + else: + input_x = paddle.static.data("Input_x", self.shape, "float32") + label = paddle.static.data("Label", self.shape, "float32") + variance = paddle.static.data("Variance", self.shape, "float32") + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss(full, reduction=reduction) + out2 = gaussian_nll_loss(input_x, label, variance) + exe = paddle.static.Executor(self.place) + if type not in ["test_err", "int32", "int64"]: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + res = exe.run( + feed={ + "Input_x": self.input_np, + "Label": self.label_np, + "Variance": self.variance_np, + }, + fetch_list=[out1, out2], + ) + for r in res: + np.allclose(out_ref, r, rtol=1e-5, atol=1e-5) + else: + try: + res = exe.run( + feed={ + "Input_x": self.input_np, + "Label": self.label_np, + "Variance": self.variance_np, + }, + fetch_list=[out1, out2], + ) + except ValueError: + pass + + def test_api(self): + self.test_dynamic_case() + self.test_static_case() + + def test_float64(self): + self.test_dynamic_case("float64") + self.test_static_case("float64") + + def test_broadcast(self): + self.test_dynamic_case("broadcast1") + self.test_static_case("broadcast1") + + def test_broadcast_with_same_dim(self): + self.test_dynamic_case("broadcast2") + self.test_static_case("broadcast2") + + def test_reduction(self): + self.test_dynamic_case(full=True, reduction="mean") + self.test_dynamic_case(full=True, reduction="sum") + self.test_static_case(full=True, reduction="mean") + + def test_error(self): + self.test_dynamic_case("test_err") + self.test_static_case("test_err") + + def test_int(self): + self.test_dynamic_case("int64") + self.test_dynamic_case("int32") + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal.py new file mode 100644 index 00000000000..816d6075099 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_greater_equal.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +import numpy as np + +import paddle +from paddle import static + + +class Test_Greater_Equal_Op_Fp16(unittest.TestCase): + def test_api_fp16(self): + paddle.enable_static() + with static.program_guard(static.Program(), static.Program()): + label = paddle.to_tensor([3, 3], dtype="float16") + limit = paddle.to_tensor([3, 2], dtype="float16") + out = paddle.greater_equal(x=label, y=limit) + # if core.is_compiled_with_cuda(): + # place = paddle.CUDAPlace(0) + # exe = static.Executor(place) + # (res,) = exe.run(fetch_list=[out]) + # self.assertEqual((res == np.array([True, True])).all(), True) + place = paddle.CustomPlace(paddle.device.get_device().split(":")[0], 0) + exe = static.Executor(place) + (res,) = exe.run(fetch_list=[out]) + self.assertEqual((res == np.array([True, True])).all(), True) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py new file mode 100644 index 00000000000..b4e4282c5ce --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py @@ -0,0 +1,62 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional import build_src_rank_and_local_expert_id + +logger = logging.getLogger(__name__) + + +class TestFusedCalculateAuxLoss(unittest.TestCase): + def test_build_src_rank_and_local_expert_id(self): + def orig_func(expert_num_global_list, num_local_experts): + send_rank_cpu = np.concatenate( # TOO SLOW!!! 
break every thing + [ + np.full([j], i // num_local_experts, dtype="int32") + for i, j in enumerate(expert_num_global_list) + ], + 0, + ) + local_expert_id_cpu = np.concatenate( + [ + np.full([j], i % num_local_experts, dtype="int32") + for i, j in enumerate(expert_num_global_list) + ], + 0, + ) + send_rank = paddle.to_tensor(send_rank_cpu) + local_expert_id = paddle.to_tensor(local_expert_id_cpu) + return send_rank, local_expert_id + + def fused_func(expert_num_global_tensor, expert_num_global, num_local_experts): + return build_src_rank_and_local_expert_id( + expert_num_global_tensor, expert_num_global, num_local_experts + ) + + expert_num_global = np.random.randint(0, 512, size=[12 * 8], dtype="int32") + expert_num_global_tensor = paddle.to_tensor(expert_num_global, dtype="int64") + + s1, l1 = orig_func(expert_num_global, 12) + s2, l2 = fused_func(expert_num_global_tensor, expert_num_global, 12) + assert ((s1 - s2) == 0).all(), (s1, s2) + assert ((l1 - l2) == 0).all(), (l1, l2) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py new file mode 100644 index 00000000000..2d5670ee739 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py @@ -0,0 +1,172 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from collections import namedtuple +from functools import partial + +from ernie_utils.moe_all_gather_layer import MOEAllGatherLayerV2 + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import expand_modality_expert_id + + +def fused_gate_logits_process_ref(self, gate_logits_lm, gate_logits_mm, token_type_ids): + """process gatelogits""" + top_k = self.k + num_expert_per_rank_per_modality = ( + gate_logits_lm.shape[-1] // self.config.moe_world_size + ) + + @paddle.no_grad() + def shift_ids(ids, modality_offset): + # 现在认为所以模态的 expert 数都一样 + rank = ids // num_expert_per_rank_per_modality + expert_id_in_rank = ids % num_expert_per_rank_per_modality + return ( + rank * (num_expert_per_rank_per_modality * 2) + + expert_id_in_rank + + modality_offset * num_expert_per_rank_per_modality + ) + + if self.group_experts: + gate_logits_lm = gate_logits_lm.reshape([gate_logits_lm.shape[0], top_k, -1]) + prob_lm = self.gate.act(gate_logits_lm) + weight_lm, expert_id_lm = prob_lm.topk(k=1, axis=-1) + weight_lm = weight_lm.reshape([gate_logits_lm.shape[0], -1]) + expert_id_lm = expert_id_lm.reshape([gate_logits_lm.shape[0], -1]) + group_size = gate_logits_lm.shape[-1] + scale = paddle.arange(0, top_k * group_size, group_size).unsqueeze(0) + expert_id_lm = expert_id_lm + scale + else: + prob_lm = self.gate.act(gate_logits_lm) + weight_lm, expert_id_lm = prob_lm.topk(k=top_k, axis=-1) + if token_type_ids is not None: + expert_id_lm = shift_ids(expert_id_lm, 0) + expert_id_lm.stop_gradient = True + lm_weight_and_expert_id = paddle.concat( + [weight_lm, expert_id_lm.astype("float32")], -1 + ) + if token_type_ids is None: + return ( + lm_weight_and_expert_id, + prob_lm.reshape([prob_lm.shape[0], -1]), + None, + ) + + prob_mm = self.gate.act(gate_logits_mm) + weight_mm, expert_id_mm = prob_mm.topk(k=top_k, axis=-1) + + expert_id_mm = shift_ids(expert_id_mm, 1) + expert_id_mm.stop_gradient = True + + mm_weight_and_expert_id = paddle.concat( + [weight_mm, expert_id_mm.astype("float32")], -1 + ) + + token_type_ids_float = token_type_ids[:, None].astype("float32") + weight_and_expert = ( + 1 - token_type_ids_float + ) * lm_weight_and_expert_id + token_type_ids_float * mm_weight_and_expert_id + return weight_and_expert, prob_lm.reshape([prob_lm.shape[0], -1]), prob_mm + + +def test_expand_modality_expert_id(): + def expand_id_one( + expert_id, + num_expert_per_modality, + k, + group_size, + modality_offset, + is_group_expert, + ): + orig_shape = expert_id.shape + expert_id = expert_id.reshape([-1]) + xid = paddle.arange(len(expert_id)) + if is_group_expert: + eid = xid % k + expert_id += eid * group_size + + rank = expert_id // num_expert_per_modality + expert_id_in_rank = expert_id % num_expert_per_modality + ret = ( + rank * (num_expert_per_modality * 2) + + expert_id_in_rank + + modality_offset * num_expert_per_modality + ) + return ret.reshape(orig_shape) + + S, E, k = 100, 24, 3 + expert_id_mm = paddle.randint(0, 12, shape=[S, k]) + num_expert_per_rank_per_modality = E // 2 // 4 + group_size = E // 2 // k + print(f"num_expert_per_rank_per_modality: {num_expert_per_rank_per_modality}") + fused = expand_modality_expert_id( + expert_id_mm, num_expert_per_rank_per_modality, group_size, 1, True + ) + + nonfused = expand_id_one( + expert_id_mm, num_expert_per_rank_per_modality, k, group_size, 1, True + ) + # num_expert_per_rank_per_modality, group_size + assert (fused == nonfused).all().item() + + Config = namedtuple("Config", ["moe_world_size"]) + Self = 
namedtuple( + "Self", + [ + "config", + "k", + "gate", + "group_experts", + "moe_statics", + "use_correction_bias", + ], + ) + Gate = namedtuple("Gate", ["act"]) + fake_gate = Gate(act=partial(F.softmax, axis=-1)) + fake_self = Self( + config=Config( + moe_world_size=8, + ), + k=k, + gate=fake_gate, + moe_statics=None, + use_correction_bias=False, + group_experts=True, + ) + + fake_logits = paddle.randn([S, E]) + fake_logits_mm = paddle.randn([S, E]) + token_type_ids = paddle.randint(0, 2, shape=[S]) + w_and_e, prob_lm, prob_mm = MOEAllGatherLayerV2.fused_gate_logits_process_fused( + fake_self, fake_logits, fake_logits_mm, None + ) + w_and_e_ref, prob_lm_ref, prob_mm_ref = fused_gate_logits_process_ref( + fake_self, fake_logits, fake_logits_mm, None + ) + assert (prob_lm == prob_lm_ref).all().item() + assert (w_and_e == w_and_e_ref).all().item() + w, e = w_and_e_ref.chunk(2, axis=-1) + + +class Test_expand_modality_expert_id_API(unittest.TestCase): + def test_dygraph(self): + test_expand_modality_expert_id() + + +if __name__ == "__main__": + + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py new file mode 100644 index 00000000000..ca0a780e908 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional import fused_rms_norm_ext + + +class TestFusedRMSNorm(unittest.TestCase): + def setUp(self): + paddle.seed(2023) + np.random.seed(2023) + + def rms_norm_reference(self, x, scale, bias=None, epsilon=1e-5): + variance = paddle.mean(paddle.square(x), axis=-1, keepdim=True) + + rms = paddle.sqrt(variance + epsilon) + y = x / rms + y = y * scale.reshape([1, -1]) + if bias is not None: + y = y + bias.reshape([1, -1]) + return y, (1.0 / rms).squeeze(-1) + + def test_2d_input(self): + rows, cols = 32, 64 + x = paddle.randn([rows, cols]) + scale = paddle.randn([cols]) + y_fused, invvar_fused = fused_rms_norm_ext(x, scale) + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + + np.testing.assert_allclose(y_fused, y_ref, rtol=1e-5, atol=1e-5) + np.testing.assert_allclose(invvar_fused, invvar_ref, rtol=1e-5, atol=1e-5) + + def test_without_bias(self): + + rows, cols = 32, 64 + x = paddle.randn([rows, cols]) + scale = paddle.randn([cols]) + + y_fused, invvar_fused = fused_rms_norm_ext(x, scale) + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + + np.testing.assert_allclose(y_fused, y_ref, rtol=1e-5, atol=1e-5) + np.testing.assert_allclose(invvar_fused, invvar_ref, rtol=1e-5, atol=1e-5) + + def test_backward(self): + + rows, cols = 16, 32 + x = paddle.randn([rows, cols], dtype="float32") + x.stop_gradient = False + scale = paddle.randn([cols], dtype="float32") + scale.stop_gradient = False + + y_fused, invvar = fused_rms_norm_ext(x, scale) + + loss = paddle.mean(y_fused) + loss.backward() + + x_grad_fused = x.grad.clone() + scale_grad_fused = scale.grad.clone() + + x.clear_gradient() + scale.clear_gradient() + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + loss_ref = paddle.mean(y_ref) + loss_ref.backward() + + x_grad_ref = x.grad + scale_grad_ref = scale.grad + + np.testing.assert_allclose(x_grad_fused, x_grad_ref, rtol=1e-4, atol=1e-4) + np.testing.assert_allclose( + scale_grad_fused, scale_grad_ref, rtol=1e-4, atol=1e-4 + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py new file mode 100644 index 00000000000..23df4e3457b --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py @@ -0,0 +1,193 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
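+
+# For orientation: a small NumPy sketch of the combine step this file verifies
+# (assumed semantics, following the combining() reference below; the name
+# `_combine_sketch` is illustrative only).
+def _combine_sketch(x, combine_weights, scatter_index):
+    import numpy as _np  # x: [num_slots, dim]; combine_weights, scatter_index: [seq, k]
+
+    # Gather the k expert outputs addressed by scatter_index and take their
+    # weighted sum per token.
+    gathered = _np.take(x, scatter_index, axis=0)  # [seq, k, dim]
+    return (combine_weights[..., None] * gathered).sum(axis=1)  # [seq, dim]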
+ +import os +import random +import unittest + +import numpy as np +from ernie_utils.moe_layer_uneven import GateCombine + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import moe_combine + +os.environ["FLAGS_flash_attn_version"] = "v1" +os.environ["FLAGS_cudnn_deterministic"] = "1" +os.environ["FLAGS_embedding_deterministic"] = "1" + + +def combining(x, combine_weights, scatter_index, hard_gate=False): + """ + Args: + x: Tensor[seq, dim] + combine_weights: [seq, k] + scatter_index: ** [seq, k] ** + + Returns: + y: Tensor[s, dim] + """ + x_gatherd = F.embedding(scatter_index, x) # [s,k,dim] + if hard_gate: + return x_gatherd.squeeze(-2) + # logger.info(f'combinning: {combine_weights}') + y = (combine_weights.unsqueeze(-1) * x_gatherd).sum(1) + # y = paddle.matmul(combine_weights.unsqueeze(1), x_gatherd).squeeze() # [s,1,k] @ [s,k,dim] -> [s,1,dim] + return y + + +def baseline_result(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy): + """baseline_result""" + scatter_index = paddle.to_tensor(scatter_index_numpy) + x = paddle.to_tensor(x_numpy).cast("float32") + x.stop_gradient = False + + combine_weights = paddle.to_tensor(combine_weights_numpy).cast("float32") + combine_weights.stop_gradient = False + + scatter_index = paddle.to_tensor(scatter_index_numpy) + grad = paddle.to_tensor(grad_numpy).cast("float32") + + y = combining(x, combine_weights, scatter_index) + paddle.autograd.backward([y], [grad], True) + return [x.grad, combine_weights.grad, y] + + +def test_moe_combine(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy): + """baseline_result""" + x = paddle.to_tensor(x_numpy).cast("float32") + x.stop_gradient = False + + combine_weights = paddle.to_tensor(combine_weights_numpy).cast("float32") + combine_weights.stop_gradient = False + + scatter_index = paddle.to_tensor(scatter_index_numpy).cast("int32") + grad = paddle.to_tensor(grad_numpy).cast("float32") + + y = GateCombine.apply(x, combine_weights, scatter_index) + paddle.autograd.backward([y], [grad], True) + # grad.backward() + return [x.grad, combine_weights.grad, y] + + +def gen_test_case(S, K, Dim, capacity_factor, seed=1234): + """gen_test_case""" + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + x_numpy = np.random.rand(int(S * capacity_factor), Dim).astype(np.float32) + combine_weights_numpy = np.random.rand(S, K).astype(np.float32) + scatter_index_numpy = np.random.permutation(max(x_numpy.shape[0], S * K))[ + : S * K + ].astype("int64") + scatter_index_numpy = scatter_index_numpy.reshape([S, K]) + + combine_weights_numpy[scatter_index_numpy >= x_numpy.shape[0]] = 0 + scatter_index_numpy[scatter_index_numpy >= x_numpy.shape[0]] = 0 + grad_numpy = np.random.randn(S, Dim).astype(np.float32) + return x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy + + +def testing(test_case): + """testing""" + [bl_x_grad, bl_combine_weights_grad, bl_y] = baseline_result(*test_case) + [fused_x_grad, fused_combine_weights_grad, fused_y] = test_moe_combine(*test_case) + np.testing.assert_allclose( + fused_y.astype("float32").numpy(), + bl_y.astype("float32").numpy(), + err_msg="fwd precision not pass", + rtol=1e-6, + ) + np.testing.assert_allclose( + fused_x_grad.astype("float32").numpy(), + bl_x_grad.astype("float32").numpy(), + rtol=1e-6, + err_msg="bwd grad precision not pass", + ) + np.testing.assert_allclose( + fused_combine_weights_grad.astype("float32").numpy(), + bl_combine_weights_grad.astype("float32").numpy(), + rtol=1e-6, + ) + + +class 
TestFused(unittest.TestCase):
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_lt_2(
+        self,
+    ):
+        """
+        Test that the fused result matches the reference precision.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises an exception on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=1.8))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_eq_2(
+        self,
+    ):
+        """
+        Test that the fused result matches the reference precision.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises an exception on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_gt_2(
+        self,
+    ):
+        """
+        Test that the fused result matches the reference precision.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises an exception on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2.2))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_k_gt_2(
+        self,
+    ):
+        """
+        Test that the fused result matches the reference precision.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises an exception on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=8, Dim=4096, capacity_factor=2))
+
+
+if __name__ == "__main__":
+
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
new file mode 100644
index 00000000000..4c209970629
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
@@ -0,0 +1,218 @@
+# ruff: noqa: C419
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
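+
+# For orientation: a hedged NumPy sketch of the per-token routing decision these
+# dispatch tests build on (top-k expert selection only; the capacity limit and
+# the partial expert range are what the real op exercised below adds). The name
+# `_topk_routing_sketch` is illustrative only.
+def _topk_routing_sketch(gate_probs, k):
+    import numpy as _np  # gate_probs: [seq, num_experts]
+
+    expert_id = _np.argsort(-gate_probs, axis=-1)[:, :k]  # [seq, k]
+    combine_weights = _np.take_along_axis(gate_probs, expert_id, axis=-1)  # [seq, k]
+    return combine_weights, expert_id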
+ +import unittest + +import paddle +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_partial_nosoftmaxtopk, +) + + +def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op(): + + s, d, e = 4, 100, 8 + k, cap = 4, 3 + local_expert_num = 2 + + # x = paddle.randn([s, d]) + # gate_logits = paddle.randn([s, e]) + x = paddle.arange(1, s + 1).unsqueeze(-1).expand([s, d]).astype("bfloat16") + x_ = x.clone().detach() + + t = ( + (paddle.arange(0, e)).unsqueeze(0) + paddle.arange(0, -s, -1).unsqueeze(-1) + ) % e + gate_logits = (1 / (t + 1)).astype("float32") + # gate_logits = F.softmax(paddle.randn([s,e]),-1).astype('float32') + gate_logits_ = gate_logits.clone().detach() + s = x.shape[0] + d = x.shape[1] + e = gate_logits.shape[1] + x.stop_gradient = False + x_.stop_gradient = False + gate_logits.stop_gradient = False + gate_logits_.stop_gradient = False + print(f"gate_logits:{gate_logits}") + + def check_ascend(index_rev, chunks): + for idx in index_rev.split(chunks.tolist()): + if len(idx) > 2: + assert (paddle.diff(idx) >= 0).all(), (index_rev,) + + ys, comm, scatter_idx = [], [], [] + for ilocal_expert in range(0, e, local_expert_num): + combine_weihgts, expert_id = gate_logits.topk(k=k, axis=1) + ( + y, + combine_weihgts, + scatter_index, + scatter_index_rev, + expert_offset, + expert_num_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, + combine_weihgts, + expert_id.astype("int32"), + k=k, + capacity=cap, + num_experts=gate_logits.shape[-1], + use_pad=False, + expert_start_index=ilocal_expert, + expert_end_index=ilocal_expert + local_expert_num, # k # cap + reverse_token_drop=False, + ) + check_ascend(scatter_index_rev, expert_num_local) + print(f"y:{y.mean(-1)}") + print(f"combine_weihgts:{combine_weihgts}") + print(f"expert_num_local:{expert_num_local}") + print(f"scatter_index:{scatter_index.transpose([1,0])}") + print(f"scatter_index_rev:{scatter_index_rev}") + + ys.append(y) + comm.append(combine_weihgts) + scatter_idx.append(scatter_index) + + comm_sum = paddle.stack(comm).sum(0) + ys_sum = paddle.concat(ys) + + ( + y_, + combine_weihgts_, + scatter_index_, + expert_offset_, + expert_id_, + ) = moe_gate_dispatch( + x_, + gate_logits_, + None, + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + valid_y = y_.sum(-1) > 0.0 + y_2 = y_[valid_y].squeeze() + + print( + f""" + y: {ys_sum.astype("float32").mean(axis=-1)} + y_: {y_2.astype("float32").mean(axis=-1)} + + comm-weight: {comm_sum} + comm-weight_: {combine_weihgts_} + + expert_id:{expert_id} + scatter_index:{scatter_index} + scatter_index_rev: {scatter_index_rev} + expert_num_global:{expert_offset} + expert_num_local:{expert_num_local} + """ + ) + + print("<<< begin backward>>>") + + assert combine_weihgts_.shape == combine_weihgts.shape, ( + combine_weihgts_.shape, + combine_weihgts.shape, + ) + + dysum, dcombine_weights_sum = paddle.ones_like(ys_sum), paddle.randn( + comm_sum.shape + ).astype(comm_sum.dtype) + dy_, dcombine_weights_ = paddle.ones_like(y_), paddle.ones_like(combine_weihgts_) + dy_[~valid_y] = 0 + + y_shapes = [len(y) for y in ys] + for dyy, yy, commm in zip( + paddle.split(dysum, y_shapes), + ys, + comm, + ): + print(f"dyy:{dyy.shape}, {yy.shape} {commm.shape}") + paddle.autograd.backward([yy, commm], [dyy, dcombine_weights_sum]) + print(x.grad.astype("float32").mean(axis=-1)) + print(f"bwd original:{y_.shape} {dy_.shape}") + paddle.autograd.backward([y_, combine_weihgts_], [dy_, dcombine_weights_]) + + print(x_.grad.astype("float32").mean(axis=-1)) + + print( + f""" + 
x: {x.grad.astype('float32').mean(axis=-1)} + x_: {x_.grad.astype('float32').mean(axis=-1)} + """ + ) + + +def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop(): + + S, E, D = 3, 4, 3 + k = 2 + capacity = 2 + x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16") + cw = paddle.randn([S, k]) + eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32") # 1 # 2 # 3 + ( + y, + cw_, + idx, + idx_rev, + num_ex_global, + num_ex_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, cw, eid, k, capacity, E, False, 0, 2, reverse_token_drop=True + ) + + y0, y1 = y.split([i for i in num_ex_local.tolist() if i > 0]) + assert y0[:, 0].astype("int32").tolist() == [2, 3], y0[:, 0] + assert y1[:, 0].astype("int32").tolist() == [1, 2] + + +def test_moe_ops_partial_nosoftmax_topk_empty_output(): + + S, E, D = 3, 4, 3 + k = 2 + capacity = 2 + x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16") + cw = paddle.randn([S, k]) + paddle.device.synchronize() + eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32") # 1 # 2 # 3 + ( + y, + cw_, + idx, + idx_rev, + num_ex_global, + num_ex_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, cw, eid, k, capacity, E, False, 3, 4, reverse_token_drop=True + ) + assert all([i == 0 for i in num_ex_local.tolist()]), num_ex_local + + +class TestAddition(unittest.TestCase): + def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op(self): + test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op() + + def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop(self): + test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop() + + def test_moe_ops_partial_nosoftmax_topk_empty_output(self): + test_moe_ops_partial_nosoftmax_topk_empty_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py new file mode 100644 index 00000000000..19752abd904 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py @@ -0,0 +1,207 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
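+
+# For orientation: a hedged NumPy sketch of the stage regrouping used as the
+# reference in get_stage_input_list below (assumed layout; the name
+# `_regroup_by_stage_sketch` is illustrative only).
+def _regroup_by_stage_sketch(dispatched, world_size, stages):
+    import numpy as _np
+
+    # View the dispatched buffer as one chunk per (rank, stage) pair, then let
+    # stage i collect chunk i of every rank: result is [stages, world_size, ...].
+    x = _np.reshape(dispatched, (world_size * stages, -1, dispatched.shape[-1]))
+    return _np.stack([x[stage::stages] for stage in range(stages)])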
+ +import os +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_permute, +) + +os.environ["FLAGS_flash_attn_version"] = "v1" +os.environ["FLAGS_cudnn_deterministic"] = "1" +os.environ["FLAGS_embedding_deterministic"] = "1" + + +class TestFused(unittest.TestCase): + def test_moe_ops(self): + """ + test `moe-ops` w/ bias + """ + S, E, D = 8192, 64, 128 + k = 4 + x = paddle.randn([S, D], dtype="bfloat16") + gate_logits = paddle.randn([S, E], dtype="float32") + x_ = x.clone() + gate_logits_ = gate_logits.clone() + x.stop_gradient = True + x_.stop_gradient = True + gate_logits.stop_gradient = True + gate_logits_.stop_gradient = True + bias = paddle.zeros([E], dtype="float32") + cap = 512 + + ( + y, + combine_weihgts, + scatter_index, + expert_offset_, + expert_id_, + ) = moe_gate_dispatch( + x, + gate_logits, + None, + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + + ( + y_, + combine_weihgts_, + scatter_index_, + expert_offset_, + expert_id_, + ) = moe_gate_dispatch( + x_, + gate_logits_, + bias + 1, # +1也不会破坏路由结果 + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + bias_unbalanced = bias.clone() + bias_unbalanced[0] += 1 + ( + y__, + combine_weihgts__, + scatter_index__, + expert_offset__, + expert_id__, + ) = moe_gate_dispatch( + x_, + gate_logits_, + bias_unbalanced, + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + np.testing.assert_equal( + y.astype("float32").numpy(), + y_.astype("float32").numpy(), + err_msg="incubate w bias not match", + ) + # bias 不影响 prob 概率 + np.testing.assert_equal( + combine_weihgts.astype("float32").numpy(), + combine_weihgts_.astype("float32").numpy(), + err_msg="incubate w bias not match", + ) + np.testing.assert_( + (y.astype("float32").numpy(0) != y__.astype("float32").numpy()).any(), + ) + + +class TestDispatchPermute(unittest.TestCase): + def get_detached_input(self, input, prob): + ret_input = input.detach() + ret_prob = prob.detach() + ret_input.stop_gradient = input.stop_gradient + ret_prob.stop_gradient = prob.stop_gradient + return ret_input, ret_prob + + def get_stage_input_list(self, x, world_size, stage): + print(world_size, stage, x.shape) + x = x.reshape([world_size * stage, -1, x.shape[-1]]) + stage_input_list = [] + x_list = paddle.split(x, num_or_sections=(world_size * stage), axis=0) + for stage_id in range(stage): + stage_input_list.append( + paddle.unsqueeze(paddle.concat(x_list[stage_id::stage], axis=0), axis=0) + ) + stage_input_list = paddle.concat(stage_input_list, axis=0) + return stage_input_list + + def test_moe_permute_ops(self): + paddle.seed(2025) + + test_cases = [ + (8, 4, 2), + (64, 16, 32), + (1024, 1024, 1024), + (8, 2, 4), + (4096, 4096, 4096), + ] + cases = list(zip(*test_cases)) + for _, case in enumerate(cases): + world_size, num_experts, num_tokens, k, hidden_size = case + capacity = num_tokens // k + stages = num_experts // world_size + + input = paddle.randn([num_tokens, hidden_size], dtype="float32") + prob_logits = paddle.randn([num_tokens, num_experts], dtype="float32") + prob = F.softmax(prob_logits, axis=-1) + input.stop_gradient = False + prob.stop_gradient = False + + compat_args = (None,) + + ref_input, ref_prob = self.get_detached_input(input, prob) + ( + ref_dispatched_input, + ref_combine_weights_unnorm, + ref_scatter_index, + ref_dispatch_mask, + _, + ) = moe_gate_dispatch( + ref_input, + ref_prob, + *compat_args, + k=k, + capacity=capacity, + use_pad=True, + ) + + 
ref_stage_input_list = self.get_stage_input_list( + ref_dispatched_input, world_size, stages + ) + + test_input, test_prob = self.get_detached_input(input, prob) + ( + test_dispatched_input, + test_combine_weights_unnorm, + test_scatter_index, + test_dispatch_mask, + _, + ) = moe_gate_dispatch_permute( + test_input, + test_prob, + *compat_args, + k=k, + capacity=capacity, + world_size=world_size, + ) + + np.testing.assert_equal( + test_dispatched_input.shape, + ref_stage_input_list.shape, + err_msg="moe_permute_ops not match", + ) + np.testing.assert_equal( + test_dispatched_input._md5sum(), + ref_stage_input_list._md5sum(), + err_msg="moe_permute_ops not match", + ) + + +if __name__ == "__main__": + + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py new file mode 100644 index 00000000000..14991becc47 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py @@ -0,0 +1,175 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_permute, +) + +batch_size = 4 +hidden_size = 2 +k = 16 +capacity = 2 +num_experts = 16 + +world_size = 2 + + +class TestLayer(paddle.nn.Layer): + def forward(self, x, gate_prob, k, capacity): + y, combine_weights, scatter_index, expert_offset, expert_id = moe_gate_dispatch( + x, gate_prob, None, k, capacity, True + ) + return y, combine_weights, scatter_index, expert_offset, expert_id + + +class TestLayerPermute(paddle.nn.Layer): + def forward(self, x, gate_prob, k, capacity): + ( + y, + combine_weights, + scatter_index, + expert_offset, + expert_id, + ) = moe_gate_dispatch_permute( + x, gate_prob, None, k, capacity, world_size=world_size + ) + return y, combine_weights, scatter_index, expert_offset, expert_id + + +def check_backward_correctness(layer_cls): + paddle.seed(1024) + + dtype = "bfloat16" + layer = layer_cls() + input = paddle.randn([batch_size, hidden_size]) + + gate_weight = paddle.randn([hidden_size, num_experts]) + logits = paddle.matmul(input, gate_weight) + gate_prob = F.softmax(logits, axis=-1) + print(f"gate_prob: {gate_prob}") + + input = paddle.cast(input, "bfloat16") + input.stop_gradient = False + gate_prob.stop_gradient = False + + output, combine_weights, scatter_index, expert_offset, expert_id = layer( + input, gate_prob, k, capacity + ) + + print(f"output: {output}") + print(f"combine_weights: {combine_weights}") + print(f"scatter_index: {scatter_index}") + print(f"expert_offset: {expert_offset}") + print(f"expert_id: {expert_id}") + + # output_g = paddle.randn(output.shape).astype(output.dtype) + # combine_weights_g = paddle.randn(combine_weights.shape).astype(combine_weights.dtype) + 
output_g = paddle.ones_like(output) + combine_weights_g = paddle.ones_like(combine_weights) + print(f"output_g: {output_g}") + print(f"combine_weights_g: {combine_weights_g}") + + paddle.autograd.backward( + tensors=[output, combine_weights], + grad_tensors=[output_g, combine_weights_g], + ) + # 数值估算 + epsilon = 0.005 + input_numpy = input.detach().astype("float32").numpy() + num_grad = paddle.zeros_like(input) + flattened = num_grad.reshape([-1]) + + for i in range(input.numel()): + input_pos = input_numpy.copy() + input_neg = input_numpy.copy() + input_pos.flat[i] += epsilon + input_neg.flat[i] -= epsilon + + output_pos, _, _, _, _ = layer( + paddle.to_tensor(input_pos), gate_prob, k, capacity + ) + output_neg, _, _, _, _ = layer( + paddle.to_tensor(input_neg), gate_prob, k, capacity + ) + + """ + flattened[i] = (output_pos.astype("float32").numpy() - output_neg.astype("float32").numpy()).sum() / ( + 2 * epsilon + ) + """ + grad_value = (output_pos - output_neg).sum() / (2 * epsilon) + flattened[i] = grad_value + + flattened = flattened.reshape(input.shape) + + print(f"input gradient: {input.grad}") + print(f"numerical gradient: {flattened}") + np.testing.assert_allclose( + input.grad.astype("float32").numpy(), + flattened.astype("float32").numpy(), + rtol=1e-5, + atol=0, + ) + + # 数值估算 gate_prob + epsilon = 0.0005 + gate_prob_numpy = gate_prob.detach().astype("float32").numpy() + num_grad = paddle.zeros_like(gate_prob) + flattened = num_grad.reshape([-1]) + + for i in range(gate_prob.numel()): + input_pos = gate_prob_numpy.copy() + input_neg = gate_prob_numpy.copy() + input_pos.flat[i] += epsilon + input_neg.flat[i] -= epsilon + + _, output_pos, _, _, _ = layer(input, paddle.to_tensor(input_pos), k, capacity) + _, output_neg, _, _, _ = layer(input, paddle.to_tensor(input_neg), k, capacity) + + grad_value = paddle.to_tensor( + (output_pos.numpy() - output_neg.numpy()).sum() / (2 * epsilon) + ) + flattened[i] = grad_value + + flattened = flattened.reshape(gate_prob.shape) + + print(f"gate_prob gradient: {gate_prob.grad}") + print(f"numerical gradient: {flattened}") + np.testing.assert_allclose( + gate_prob.grad.astype("float32").numpy(), + flattened.astype("float32").numpy(), + rtol=1e-4, + atol=0, + ) + + +class TestFused(unittest.TestCase): + def test_moe_backward(self): + check_backward_correctness(TestLayer) + + def test_moe_permute_backward(self): + check_backward_correctness(TestLayerPermute) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm.py new file mode 100644 index 00000000000..dbeaee31f6c --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_layer_norm.py @@ -0,0 +1,358 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
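+
+# For reference while reading this test, the per-row LayerNorm forward being
+# checked (after flattening to [N, D] at begin_norm_axis) is, in sketch form
+# (kept as a comment so the __future__ import below stays first; it mirrors
+# _reference_layer_norm_naive defined later in this file):
+#
+#     mean = np.mean(x, axis=-1, keepdims=True)
+#     var = np.var(x, axis=-1, keepdims=True)
+#     y = (x - mean) / np.sqrt(var + epsilon) * scale + bias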
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from operator import mul +import paddle.base.core as core +import paddle.nn.functional as F +import paddle.base as base +from functools import reduce +from op_test import _set_use_system_allocator +from paddle.static.amp.fp16_utils import ( + _keep_layer_norm_scale_bias_to_fp32, +) +from paddle.pir_utils import OldIrGuard + +paddle.enable_static() + +np.random.random(123) + +_set_use_system_allocator(True) + + +def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): + x_shape = x.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + x.shape = [N, D] + + mean = np.mean(x, axis=1) + var = np.var(x, axis=1) + epsilon + output = np.divide((x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1])) + if scale is not None: + output = scale.reshape([1, D]) * output + if beta is not None: + output = output + beta.reshape([1, D]) + + x.shape, output.shape = x_shape, x_shape + return output, mean, var + + +def _reference_layer_norm_grad(x, grad_y, scale, bias, mean, var, begin_norm_axis=1): + x_shape = x.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + + if scale is not None: + scale_shape = scale.shape + scale.shape = [1, D] + x.shape, grad_y.shape = [N, D], [N, D] + var.shape, mean.shape = [N, 1], [N, 1] + + # d_bias + if bias is not None: + d_bias = np.sum(grad_y, axis=0).reshape([1, D]) + else: + d_bias = None + # d_scale + if scale is not None: + d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y, axis=0).reshape( + [1, D] + ) + else: + d_scale = None + # dx + if scale is not None: + dx_end = scale * np.sqrt(1.0 / var) * grad_y + d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape( + [N, 1] + ) # the second part equals to zero. + d_mean = 1.0 / D * d_mean_0 + d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape( + [N, 1] + ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + else: + dx_end = 1.0 * np.sqrt(1.0 / var) * grad_y + d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * 1.0, axis=1).reshape( + [N, 1] + ) # the second part equals to zero. 
+ d_mean = 1.0 / D * d_mean_0 + d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * 1.0, axis=1).reshape( + [N, 1] + ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + + grad_x = dx_end + d_mean + d_std + + grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape + var.shape, mean.shape = [N], [N] + + if scale is not None: + scale.shape = scale_shape + return grad_x, d_scale, d_bias + + +class TestLayerNormOp(unittest.TestCase): + def setUp(self): + self.init_dtype() + self.place = paddle.CustomPlace("metax_gpu", 0) + self.__class__.use_custom_device = True + + def init_dtype(self): + self.dtype = np.float32 + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + np.testing.assert_allclose( + np.array(tensor), np_array, rtol=1e-4, atol=atol, err_msg=msg + ) + + def check_forward_backward( + self, + shape, + begin_norm_axis, + has_scale=True, + has_bias=True, + y_grad_scale=1.0, + use_mkldnn=False, + ): + def test_with_place(place, shape, begin_norm_axis, use_mkldnn=use_mkldnn): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(self.dtype) + scale = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_scale + else None + ) + bias = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_bias + else None + ) + y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype( + self.dtype + ) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive( + x, scale, bias, epsilon, begin_norm_axis + ) + x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( + x, y_grad, scale, bias, mean, variance, begin_norm_axis + ) + mean.shape = x_shape[0:begin_norm_axis] + variance.shape = x_shape[0:begin_norm_axis] + + var_dict = locals() + var_dict["y@GRAD"] = y_grad + var_names = ["x", "mean", "variance", "y", "y@GRAD"] + if has_scale: + var_names += ["scale"] + if has_bias: + var_names += ["bias"] + ground_truth = {name: var_dict[name] for name in var_names} + + with OldIrGuard(): + program = base.Program() + old_program_guard = base.program_guard + with old_program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, dtype=self.dtype, shape=ground_truth[name].shape + ) + inputs = {"X": block.var("x")} + fetch_list = [ + "y", + "mean", + "variance", + "x@GRAD", + ] + if has_scale: + inputs["Scale"] = block.var("scale") + fetch_list += ["scale@GRAD"] + if has_bias: + inputs["Bias"] = block.var("bias") + fetch_list += ["bias@GRAD"] + layer_norm_op = block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var("y"), + "Mean": block.var("mean"), # share the same memory + "Variance": block.var("variance"), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": use_mkldnn, + }, + ) + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + layer_norm_op.desc, set(), [] + ) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) 
+ + program._sync_with_cpp() + exe = base.Executor(place) + with OldIrGuard(): + out = exe.run( + program, + feed={ + name: var_dict[name] + for name in ["x", "scale", "bias", "y@GRAD"] + }, + fetch_list=fetch_list, + ) + + self.__assert_close(y, out[0], "y") + self.__assert_close(mean, out[1], "mean") + self.__assert_close(variance, out[2], "variance", 1e-3) + self.__assert_close(x_grad, out[3], "x_grad") + if has_scale: + self.__assert_close( + scale_grad.reshape(-1), + out[fetch_list.index("scale@GRAD")], + "scale_grad", + 1e-3, + ) + if has_bias: + self.__assert_close( + bias_grad.reshape(-1), + out[fetch_list.index("bias@GRAD")], + "bias_grad", + ) + + test_with_place(self.place, shape, begin_norm_axis) + + def test_check_forward_backward_with_scale_and_bias(self): + self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=True + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=True, has_bias=False + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=False + ) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) + self.check_forward_backward( + shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2) + self.check_forward_backward( + shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=True, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=True, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True + ) + + +class TestFP16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + weight_np = weight_np.astype(dtype) + bias_np = bias_np.astype(dtype) + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + y_np = y.numpy().astype("float32") + x_g_np = x_g.numpy().astype("float32") + w_g_np = w_g.numpy().astype("float16") + b_g_np = b_g.numpy().astype("float32") + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + paddle.set_device("metax_gpu") + x_np = np.random.random([10, 20]).astype("float16") + weight_np = np.random.random([20]).astype("float16") + bias_np = np.random.random([20]).astype("float16") + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, "float16" + ) + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, "float32" + ) + + def assert_equal(x, y): + np.testing.assert_allclose(x, y) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + +class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): + def test_main(self): + 
self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(False) + self.assertFalse(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(True) + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py new file mode 100644 index 00000000000..7545e16d14d --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py @@ -0,0 +1,395 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +from tests.op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size,)) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size,)) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestBmmOp(OpTest): + """ + case 0 + """ + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (10, 2, 5) + self.y_shape = (10, 5, 8) + + def init_kernel_type(self): + self.dtype = "float32" + + def setUp(self): + self.set_metax_gpu() + self.init_kernel_type() + self.config() + self.op_type = "bmm" + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y + result = reference_matmul(x, y) + result = result.astype(self.dtype) + self.inputs = { + "X": x, + "Y": y, + } + self.outputs = {"Out": result} + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp1(TestBmmOp): + """ + case 1 + """ + + def config(self): + self.x_shape = (40, 10, 10) + self.y_shape = (40, 10, 10) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp2(TestBmmOp): + """ + case 2 + """ + + def config(self): + self.x_shape = (4, 10, 80) + self.y_shape = (4, 80, 1) + + def 
test_check_grad(self): + self.check_grad_with_place( + self.place, + ["X", "Y"], + "Out", + max_relative_error=1e-2, + ) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_metax_gpu() + self.op_type = "matmul_v2" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {"X": X, "Y": Y} + self.attrs = { + "trans_x": self.transpose_X, + "trans_y": self.transpose_Y, + "alpha": self.alpha, + } + self.outputs = {"Out": Out} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (100,) + self.y_shape = (100,) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100,) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100,) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 4, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class 
TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = 100 + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = 100 + self.transpose_X = False + self.transpose_Y = False + + +# TODO(metax_gpu): alpha will be supported in next version +# --------------------test matmul alpha-------------------- +# def create_test_alpha_class(parent): +# class TestMatMulOpAlphaCase(parent): +# def init_alpha(self): +# self.alpha = 0.125 + +# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") +# TestMatMulOpAlphaCase.__name__ = cls_name +# globals()[cls_name] = TestMatMulOpAlphaCase + +# create_test_alpha_class(TestMatMulOp) +# create_test_alpha_class(TestMatMulOp1) +# create_test_alpha_class(TestMatMulOp2) +# create_test_alpha_class(TestMatMulOp3) +# create_test_alpha_class(TestMatMulOp4) +# create_test_alpha_class(TestMatMulOp5) +# create_test_alpha_class(TestMatMulOp6) +# create_test_alpha_class(TestMatMulOp9) +# create_test_alpha_class(TestMatMulOp10) +# create_test_alpha_class(TestMatMulOp11) +# create_test_alpha_class(TestMatMulOp12) +# create_test_alpha_class(TestMatMulOp13) + + +# --------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error + ) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py new file mode 100644 index 00000000000..c9bccd2abb3 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py @@ -0,0 +1,220 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import OpTest, convert_float_to_uint16 + +import paddle +from paddle import base +from paddle.base import Program, program_guard + + +def call_nonzero(x): + input = paddle.to_tensor(x) + return paddle.nonzero(x=input) + + +class TestNonZeroAPI(unittest.TestCase): + def test_nonzero_api_as_tuple(self): + paddle.enable_static() + data = np.array([[1, 0], [0, 1]], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x, as_tuple=True) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 2) + z = paddle.concat(list(y), axis=0) + exe = base.Executor(base.CPUPlace()) + + (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False) + expect_out = np.array([0, 1, 0, 1]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + data = np.array([1, 1, 0], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x, as_tuple=True) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 1) + z = paddle.concat(list(y), axis=0) + exe = base.Executor(base.CPUPlace()) + (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False) + expect_out = np.array([0, 1]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + data = np.zeros([10, 3, 0], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[10, 3, 0], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x, as_tuple=True) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 3) + expect_out = np.zeros([0]) + for item in y: + np.testing.assert_array_equal(expect_out, item) + + def test_nonzero_api(self): + paddle.enable_static() + data = np.array([[1, 0], [0, 1]], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x) + exe = base.Executor(base.CPUPlace()) + (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False) + expect_out = np.array([[0, 0], [1, 1]]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + data = np.array([1, 1, 0], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x) + exe = base.Executor(base.CPUPlace()) + (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False) + expect_out = np.array([[0], [1]]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + def test_dygraph_api(self): + data_x = np.array([[True, False], [False, True]]) + with base.dygraph.guard(): + x = paddle.to_tensor(data_x) + z = paddle.nonzero(x) + np_z = z.numpy() + expect_out = np.array([[0, 0], [1, 1]]) + + +# Base case +class TestNonzeroOp(OpTest): + def setUp(self): + """Test where_index op with random value""" + np.random.seed(2023) + self.op_type = "where_index" + self.python_api = 
call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [8, 8] + + def init_dtype(self): + self.dtype = np.float64 + + def create_inputs(self): + return {"Condition": np.random.randint(5, size=self.shape).astype(self.dtype)} + + def return_outputs(self): + return {"Out": np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestNonzeroComplex64Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex64 + + +class TestNonzeroComplex128Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex128 + + +class TestNonzeroFP32Op(TestNonzeroOp): + def init_shape(self): + self.shape = [2, 10, 2] + + def init_dtype(self): + self.dtype = np.float32 + + +class TestNonzeroFP16Op(TestNonzeroOp): + def init_shape(self): + self.shape = [3, 4, 7] + + def init_dtype(self): + self.dtype = np.float16 + + +class TestNonzeroBF16(OpTest): + def setUp(self): + """Test where_index op with bfloat16 dtype""" + np.random.seed(2023) + self.op_type = "where_index" + self.python_api = call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [12, 9] + + def init_dtype(self): + self.dtype = np.uint16 + + def create_inputs(self): + return { + "Condition": convert_float_to_uint16( + np.random.randint(5, size=self.shape).astype(np.float32) + ) + } + + def return_outputs(self): + return {"Out": np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestZeroSizeOp(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + +class TestZeroSizeOpCase2(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py new file mode 100644 index 00000000000..c1bc46517b6 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py @@ -0,0 +1,215 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
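+
+# For orientation: a hedged NumPy sketch of the p-norm this file checks, for the
+# common case of a single reduce axis and finite p > 0 (the p_norm helper below
+# also covers p = 0, +/-inf and multi-axis reductions). The name
+# `_p_norm_sketch` is illustrative only.
+def _p_norm_sketch(x, axis, porder, keepdim=False):
+    import numpy as _np
+
+    # norm = (sum_i |x_i|^p)^(1/p) along the reduce axis
+    s = _np.sum(_np.power(_np.abs(x), porder), axis=axis, keepdims=keepdim)
+    return _np.power(s, 1.0 / porder)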
+ +import unittest +import numpy as np + +import paddle +from tests.op_test import OpTest + +paddle.enable_static() + + +def p_norm(x, axis, porder, keepdims=False, reduce_all=False): + r = [] + if axis is None or reduce_all: + x = x.flatten() + if porder == np.inf: + r = np.amax(np.abs(x), keepdims=keepdims) + elif porder == -np.inf: + r = np.amin(np.abs(x), keepdims=keepdims) + else: + r = np.linalg.norm(x, ord=porder, keepdims=keepdims) + elif isinstance(axis, list or tuple) and len(axis) == 2: + if porder == np.inf: + axis = tuple(axis) + r = np.amax(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == -np.inf: + axis = tuple(axis) + r = np.amin(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == 0: + axis = tuple(axis) + r = x.astype(bool) + r = np.sum(r, axis, keepdims=keepdims) + elif porder == 1: + axis = tuple(axis) + r = np.sum(np.abs(x), axis, keepdims=keepdims) + else: + axis = tuple(axis) + xp = np.power(np.abs(x), porder) + s = np.sum(xp, axis=axis, keepdims=keepdims) + r = np.power(s, 1.0 / porder) + else: + if isinstance(axis, list): + axis = tuple(axis) + r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims) + r = r.astype(x.dtype) + + return r + + +class TestPnormOp(OpTest): + def set_metax_gpu(self): + self.__class__.use_custom_device = True + + def setUp(self): + self.set_metax_gpu() + self.op_type = "p_norm" + self.init_test_case() + x = (np.random.random(self.shape) + 0.5).astype(self.dtype) + norm = p_norm(x, self.axis, self.porder, self.keepdim) + self.inputs = {"X": x} + self.attrs = { + "epsilon": self.epsilon, + "axis": self.axis, + "keepdim": self.keepdim, + "porder": float(self.porder), + } + self.outputs = {"Out": norm} + self.gradient = self.calc_gradient() + + def test_check_output(self): + if self.dtype == "float16": + self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0), atol=5e-3) + else: + self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0)) + + def test_check_grad(self): + self.check_grad_with_place( + paddle.CustomPlace("metax_gpu", 0), + ["X"], + "Out", + user_defined_grads=self.gradient, + ) + + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = 1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.init_dtype() + + def init_dtype(self): + self.dtype = "float32" + + def calc_gradient(self): + self.attrs = { + "epsilon": self.epsilon, + "axis": self.axis, + "keepdim": self.keepdim, + "porder": float(self.porder), + } + x = self.inputs["X"] + porder = self.attrs["porder"] + axis = self.attrs["axis"] + if porder == 0: + grad = np.zeros(x.shape).astype(x.dtype) + elif porder in [float("inf"), float("-inf")]: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + x_abs = np.abs(x) + grad = np.sign(x) + grad[x_abs != norm] = 0.0 + else: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + grad = ( + np.power(norm, 1 - porder) + * np.power(np.abs(x), porder - 1) + * np.sign(x) + ) + + numel = 1 + for s in x.shape: + numel *= s + numel /= x.shape[axis] + return [grad.astype(x.dtype) * 1 / numel] + + +class TestPnormOp2(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = True + self.init_dtype() + + +# class TestPnormOp3(TestPnormOp): +# def init_test_case(self): +# self.shape = [3, 20, 3] +# self.axis = 2 +# self.epsilon = 1e-12 +# self.porder = np.inf +# self.keepdim = True +# self.init_dtype() + + +# class TestPnormOp4(TestPnormOp3): +# def init_test_case(self): +# 
self.shape = [3, 20, 3] +# self.axis = 2 +# self.epsilon = 1e-12 +# self.porder = -np.inf +# self.keepdim = True +# self.init_dtype() + + +class TestPnormOp5(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 0 + self.keepdim = True + self.init_dtype() + + +# class TestPnormOp6(TestPnormOp): +# def init_test_case(self): +# self.shape = [2, 3, 4, 5] +# self.axis = 1 +# self.epsilon = 1e-12 +# self.porder = 0.5 +# self.keepdim = False +# self.init_dtype() + + +class TestPnormOpfp16(TestPnormOp): + def init_dtype(self): + self.dtype = "float16" + + +class TestPnormOp2fp16(TestPnormOp2): + def init_dtype(self): + self.dtype = "float16" + + +# class TestPnormOp3fp16(TestPnormOp3): +# def init_dtype(self): +# self.dtype = "float16" + + +# class TestPnormOp4fp16(TestPnormOp4): +# def init_dtype(self): +# self.dtype = "float16" + + +class TestPnormOp5fp16(TestPnormOp5): + def init_dtype(self): + self.dtype = "float16" + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py new file mode 100644 index 00000000000..c67e807397c --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py @@ -0,0 +1,125 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +# import sys + +# sys.path.append("..") + +import numpy as np + +import paddle +from tests.op_test import OpTest + +paddle.enable_static() + + +# Correct: General. 
+class TestSqueezeOp(OpTest): + def setUp(self): + self.op_type = "squeeze2" + self.init_test_case() + self.set_metax_gpu() + self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + } + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.ori_shape = (1, 3, 1, 40) + self.axes = (0, 2) + self.new_shape = (3, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# class TestSqueezeBF16Op(OpTest): +# def setUp(self): +# self.op_type = "squeeze2" +# self.dtype = np.uint16 +# self.init_test_case() +# self.set_metax_gpu() +# x = np.random.random(self.ori_shape).astype("float32") +# out = x.reshape(self.new_shape) +# self.inputs = {"X": convert_float_to_uint16(x)} +# self.init_attrs() +# self.outputs = {"Out": convert_float_to_uint16(out)} + +# def set_metax_gpu(self): +# self.__class__.use_custom_device = True +# self.place = paddle.CustomPlace("metax_gpu", 0) + +# def test_check_output(self): +# self.check_output() + +# def test_check_grad(self): +# self.check_grad(["X"], "Out") + +# def init_test_case(self): +# self.ori_shape = (1, 3, 1, 40) +# self.axes = (0, 2) +# self.new_shape = (3, 40) + +# def init_attrs(self): +# self.attrs = {"axes": self.axes} + + +# Correct: There is mins axis. +class TestSqueezeOp1(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (1, 3, 1, 40) + self.axes = (0, -2) + self.new_shape = (3, 40) + + +# Correct: No axes input. +class TestSqueezeOp2(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (1, 20, 1, 5) + self.axes = () + self.new_shape = (20, 5) + + +# Correct: Just part of axes be squeezed. +class TestSqueezeOp3(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (6, 1, 5, 1, 4, 1) + self.axes = (1, -1) + self.new_shape = (6, 5, 1, 4) + + +# Correct: The demension of axis is not of size 1 remains unchanged. +class TestSqueezeOp4(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (6, 1, 5, 1, 4, 1) + self.axes = (1, 2) + self.new_shape = (6, 5, 1, 4, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py new file mode 100644 index 00000000000..40e46e70a21 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py @@ -0,0 +1,295 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +from op_test import OpTest + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import _C_ops +from paddle.base import core +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.incubate.nn.functional import swiglu as fused_swiglu_impl + + +def swiglu(x, y, out_grad): + if isinstance(x, np.ndarray): + x = paddle.to_tensor(x) + y = paddle.to_tensor(y) + out_grad = paddle.to_tensor(out_grad) + + origin_x = x.detach().clone() + origin_x.stop_gradient = False + x = origin_x + + origin_y = y.detach().clone() + origin_y.stop_gradient = False + y = origin_y + + dtype = x.dtype + need_convert = False + assert dtype == y.dtype + output_dtype = dtype + if paddle.is_compiled_with_cuda(): + if dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + x = x.astype(output_dtype) + y = y.astype(output_dtype) + need_convert = True + + out = F.silu(x) * y + if need_convert: + out = out.astype(dtype) + out.backward(out_grad) + ret = [ + out.astype(output_dtype), + origin_x.grad.astype(output_dtype), + origin_y.grad.astype(output_dtype), + ] + return ret + + +def fused_swiglu(x, y, out_grad): + x = x.detach().clone() + x.stop_gradient = False + if y is not None: + y = y.detach().clone() + y.stop_gradient = False + out = fused_swiglu_impl(x, y) + out.backward(out_grad) + + output_dtype = x.dtype + if paddle.is_compiled_with_cuda(): + if x.dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + ret = [ + out.astype(output_dtype), + ] + if y is not None: + x_grad, y_grad = x.grad, y.grad + else: + x_grad, y_grad = paddle.split(x.grad, 2, axis=-1) + + ret.append(x_grad.astype(output_dtype)) + ret.append(y_grad.astype(output_dtype)) + return ret + + +tol_map = { + paddle.float64: [1e-8, 1e-8], + paddle.float32: [1e-6, 1e-6], + paddle.float16: [1e-3, 1e-3], + paddle.bfloat16: [1e-3, 1e-3], +} + + +class TestSwiGLUDygraph(unittest.TestCase): + def check_dygraph_impl(self, device, shape, dtype): + x = paddle.randn(shape, dtype=dtype) + y = paddle.randn(shape, dtype=dtype) + out_grad = paddle.randn(shape, dtype=dtype) + + ret1 = swiglu(x, y, out_grad) + ret2 = fused_swiglu(x, y, out_grad) + ret3 = fused_swiglu(paddle.concat([x, y], axis=-1), None, out_grad) + + atol, rtol = tol_map[dtype] + err_msg = f"Failed when device = {device}, dtype = {dtype}, shape = {shape}" + for t1, t2, t3 in zip(ret1, ret2, ret3): + t1, t2, t3 = t1.numpy(), t2.numpy(), t3.numpy() + np.testing.assert_allclose(t1, t2, atol=atol, rtol=rtol, err_msg=err_msg) + np.testing.assert_equal(t2, t3, err_msg=err_msg) + + def check_dygraph(self, shape): + metas = [("cpu", paddle.float32), ("cpu", paddle.float64)] + if paddle.is_compiled_with_cuda(): + metas.append(("gpu", paddle.float32)) + metas.append(("gpu", paddle.float64)) + metas.append(("gpu", paddle.float16)) + prop = paddle.device.cuda.get_device_properties() + if prop.major >= 8: + metas.append(("gpu", paddle.bfloat16)) + + for device, dtype in metas: + origin_device = paddle.get_device() + paddle.set_device(device) + for with_split in [True]: + self.check_dygraph_impl(device, shape, dtype) + paddle.set_device(origin_device) + + def check_static_graph(self, shape, dtype="float32"): + x = paddle.static.data(name="x", shape=shape, dtype=dtype) + y = paddle.static.data(name="y", shape=shape, dtype=dtype) + concated_x = paddle.static.data( + name="concated_x", + shape=[*shape[:-1], shape[-1] * 2], + 
dtype=dtype, + ) + out1 = fused_swiglu_impl(x, y) + out2 = fused_swiglu_impl(concated_x) + + concated_x_np = np.random.random(concated_x.shape).astype(dtype) + x_np, y_np = np.split(concated_x_np, 2, axis=-1) + + exe = paddle.static.Executor() + t1, t2 = exe.run( + feed={"x": x_np, "y": y_np, "concated_x": concated_x_np}, + fetch_list=[out1, out2], + ) + np.testing.assert_equal(t1, t2) + + def check_main(self, shape): + self.check_dygraph(shape) + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + self.check_static_graph(shape) + paddle.disable_static() + + def test_main(self): + self.check_main([8, 100]) + self.check_main([4, 101]) + + +class TestSwigluOp(OpTest): + def config(self): + self.x_shape = (8, 128) + self.check_auto_parallel = True + + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + y = np.random.uniform(-1, 1, self.x_shape).astype("float64") + out_grad = np.random.uniform(-1, 1, self.x_shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + def test_check_output(self): + self.check_output(check_prim_pir=True) + + def test_check_grad(self): + self.check_grad( + ["x", "y"], + "out", + check_auto_parallel=self.check_auto_parallel, + check_dygraph=1, + check_prim_pir=True, + ) + + +class TestSwigluOp2(TestSwigluOp): + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + tmp_inputs = np.split(x, 2, axis=-1) + x = tmp_inputs[0] + y = tmp_inputs[1] + out_grad = np.random.uniform(-1, 1, x.shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + +@unittest.skipIf( + not paddle.base.core.is_compiled_with_dist(), + "The spmd rule is should be tested with distributed=ON", +) +class TestSwigluSpmd(unittest.TestCase): + def setUp(self): + self.kernel = "swiglu" + self.rule = paddle.base.core.get_phi_spmd_rule(self.kernel) + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [-1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.y_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.out_dist_tensor_spec = DistTensorSpec(self.x_dist_tensor_spec) + + def test_input_x_y(self): + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, self.y_dist_tensor_spec + ) + inferred_input_dist_attrs = result_dist_attrs[0] + inferred_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(inferred_input_dist_attrs), 2) + self.assertEqual(len(inferred_output_dist_attrs), 1) + self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [-1, 0]) + + def test_input_x_unshard_last_dim(self): + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + 
x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [0, -1] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, DistTensorSpec() + ) + inferred_input_dist_attrs = result_dist_attrs[0] + inferred_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(inferred_input_dist_attrs), 2) + self.assertEqual(len(inferred_output_dist_attrs), 1) + self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [0, -1]) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), "mamtul 0 size only with in cuda") +class TestSwiglu0SizeDygraph(unittest.TestCase): + def test_swiglu(self): + x = paddle.ones([0, 128], dtype="float32") + y = paddle.ones([0, 128], dtype="float32") + x.stop_gradient = False + y.stop_gradient = False + out = fused_swiglu_impl(x, y) + + dz = paddle.ones([0, 128], dtype="float32") + + out = _C_ops.swiglu_grad(x, y, dz) + + self.assertEqual(out[0].shape, x.shape) + self.assertEqual(out[1].shape, y.shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py new file mode 100644 index 00000000000..4369972255d --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py @@ -0,0 +1,162 @@ +# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +# # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle + + +def TopPProcess(probs, top_p): + sorted_probs = paddle.sort(probs, descending=True) + sorted_indices = paddle.argsort(probs, descending=True) + cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) + + # Remove tokens with cumulative probs above the top_p, But keep at + # least min_tokens_to_keep tokens + sorted_indices_to_remove = cumulative_probs > top_p + + # Keep the first token + sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove, dtype="int64") + + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, + (slice(None), slice(1, None)), + sorted_indices_to_remove[:, :-1].clone(), + ) + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, (slice(None), 0), 0 + ) + + # Scatter sorted tensors to original indexing + sorted_indices = ( + sorted_indices + paddle.arange(probs.shape[0]).unsqueeze(-1) * probs.shape[-1] + ) + condition = paddle.scatter( + sorted_indices_to_remove.flatten(), + sorted_indices.flatten(), + sorted_indices_to_remove.flatten(), + ) + condition = paddle.cast(condition, "bool").reshape(probs.shape) + probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) + next_tokens = paddle.multinomial(probs) + next_scores = paddle.index_sample(probs, next_tokens) + return next_scores, next_tokens + + +class TestTopPAPI(unittest.TestCase): + def setUp(self): + self.topp = 0.0 + self.seed = 6688 + self.batch_size = 3 + self.vocab_size = 10000 + self.dtype = "float32" + self.input_data = np.random.rand(self.batch_size, self.vocab_size) + + def run_dygraph(self, place): + with paddle.base.dygraph.guard(place): + input_tensor = paddle.to_tensor(self.input_data, self.dtype) + topp_tensor = paddle.to_tensor( + [ + self.topp, + ] + * self.batch_size, + self.dtype, + ).reshape((-1, 1)) + + # test case for basic test case 1 + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res = TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + # test case for basic test case 1 + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, + topp_tensor, + seed=-1, + k=5, + mode="non-truncated", + return_top=True, + ) + ref_res = TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + input_tensor = paddle.static.data( + name="x", shape=[6, 1030], dtype=self.dtype + ) + topp_tensor = paddle.static.data( + name="topp", shape=[6, 1], dtype=self.dtype + ) + result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res = TopPProcess(input_tensor, self.topp) + exe = paddle.static.Executor(place) + input_data = np.random.rand(6, 1030).astype(self.dtype) + paddle_result = exe.run( + feed={ + "x": input_data, + "topp": np.array( + [ + self.topp, + ] + * 6 + ).astype(self.dtype), + }, + fetch_list=[ + result[0], + result[1], + ref_res[0], + ref_res[1], + ], + ) + np.testing.assert_allclose(paddle_result[0], paddle_result[2], rtol=1e-05) + 
np.testing.assert_allclose(paddle_result[1], paddle_result[3], rtol=1e-05) + + def test_dygraph(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_dygraph(place) + + def test_static(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_static(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py new file mode 100644 index 00000000000..ff22c2c9ac9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest + +from tests.op_test import OpTest +import paddle + +paddle.enable_static() + + +# Correct: General. +class TestUnsqueezeOp(OpTest): + def setUp(self): + self.set_metax_gpu() + self.op_type = "unsqueeze2" + self.dtype = "float32" + self.init_test_case() + self.inputs = {"X": np.random.random(self.ori_shape).astype(self.dtype)} + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: Single input index. +class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1,) + self.new_shape = (20, 5, 1) + + +# Correct: Mixed input axis. +class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + +# Correct: There is duplicated axis. +class TestUnsqueezeOp3(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + +# Correct: Reversed axes. 
+class TestUnsqueezeOp4(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (10, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (10, 1, 1, 2, 5, 1)
+
+
+# test float16
+class TestUnsqueezeOp5(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.dtype = "float16"
+        self.ori_shape = (10, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (10, 1, 1, 2, 5, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 528ec55971cd8e115b3d0a7e2103bd4ebf7493a5 Mon Sep 17 00:00:00 2001
From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Date: Tue, 16 Sep 2025 11:39:34 +0800
Subject: [PATCH 011/121] [Metax] update metax CI CMakeLists (#16)

* [Metax] update metax CI

* [Metax] update metax CI CMakeLists
---
 backends/metax_gpu/tests/CMakeLists.txt | 44 +++++++++++++++----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt
index 7e549ef4eaa..37475773026 100755
--- a/backends/metax_gpu/tests/CMakeLists.txt
+++ b/backends/metax_gpu/tests/CMakeLists.txt
@@ -87,24 +87,32 @@ list(
 list(
   REMOVE_ITEM
   PYTHON_TEST_SCRIPTS
-  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py)
+  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # precision issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # affected by test_sum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # precision issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion
+  # adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # in op_test.py, the
+  # self._get_places()
+  # interface has an adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # same error as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # same error as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # same error as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # same error as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64
+  # precision
+  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # in op_test.py, the
+  # self._get_places() interface has an adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties
+)
 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)
 foreach(test_script ${PYTHON_TEST_SCRIPTS})

From 5b31405c13c32af5dbc826f7e8fec58e64a74322 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Tue, 16 Sep 2025 15:02:29 +0800
Subject: [PATCH 012/121] [Metax] add github action (#18)

* [Metax] add github action

---------

Co-authored-by: Mingkun.Zhang <2496808993@qq.com>
Co-authored-by: metax666
Co-authored-by: jiaxinWang-metax <189149612@qq.com>
Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Co-authored-by: chezhang <1376507468@qq.com>
Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com>
Co-authored-by: ZhouDuan <1184319564@qq.com>
---
 .github/workflows/metax_work.yaml | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 .github/workflows/metax_work.yaml

diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml
new file mode 100644
index 00000000000..0d3d2637cdd
--- /dev/null
+++ b/.github/workflows/metax_work.yaml
@@ -0,0 +1,52 @@
+name: paddle metax gpu test
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [opened, synchronize]
+    branches: [develop, release/**]
+    paths:
+      - "**"
+      - "!backends/**"
+      - "backends/metax_gpu/**"
+
+permissions: read-all
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  metax-gpu-test:
+    runs-on: paddle-metax-runner-set
+    steps:
+      - name: Checkout repository
+        run: |
+          git config --global user.name "GitHub Actions"
+          git config --global user.email "actions@github.com"
+
+          if [ "${{ github.event_name }}" == "pull_request" ]; then
+            BRANCH_NAME=${{ github.head_ref }}
+          else
+            BRANCH_NAME=${{ github.ref_name }}
+          fi
+
+          git clone \
+            --reference-if-able /home/runner/PaddleCustomDevice \
+            --depth=1 \
+            --shallow-submodules \
+            --jobs=8 \
+            --branch $BRANCH_NAME \
+            --recurse-submodules \
+            https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git .
+ + + - name: compile + run: | + cd backends/metax_gpu + bash build.sh + + - name: run test + run: | + cd backends/metax_gpu/tests + bash run_test.sh From b93c971b17729f09733faf5400d7ba44f1e5f3f2 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:15:34 +0800 Subject: [PATCH 013/121] [metax] chang build (#19) * [metax]chaneg build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index dd0ab3aab90..c288ea22312 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -20,7 +20,7 @@ set -e pip uninstall paddlepaddle -y -export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 +# export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ @@ -50,7 +50,7 @@ fi echo "make_maca" cd build cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON -make_maca -j8 +make_maca -j60 echo "install whl" pip install dist/paddle_metax_gpu*.whl --force-reinstall From 6dbbe848d672a27bbbdded8e399ff5b1229c6647 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:04:55 +0800 Subject: [PATCH 014/121] change_build (#20) * [metax]chaneg build --------- --- backends/metax_gpu/build.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index c288ea22312..e52cddc6476 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -20,15 +20,18 @@ set -e pip uninstall paddlepaddle -y -# export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 +# init paddle +git submodule sync --recursive && git submodule update --init --recursive + + +export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ # exit 1 -# init paddle -git submodule sync --recursive && git submodule update --init --recursive +unset http_proxy https_proxy # apply patch bash change_patch.sh From ef1b28e5d17ceac419de30f8ba129f16444bd39d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:18:54 +0800 Subject: [PATCH 015/121] change_build (#21) --- backends/metax_gpu/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index e52cddc6476..a40cac19e19 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -25,6 +25,7 @@ git submodule sync --recursive && git submodule update --init --recursive export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 +export pip install 
safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ From 3737e488da962ae43cde4d51e495454a2818eb01 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:24:15 +0800 Subject: [PATCH 016/121] change_build (#22) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index a40cac19e19..e3c4304e5f8 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -30,7 
+30,6 @@ pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/ # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -# exit 1 unset http_proxy https_proxy From 16f35844e7218d0eb67aaffe6379c2a8820241e7 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Tue, 16 Sep 2025 16:52:30 +0800 Subject: [PATCH 017/121] =?UTF-8?q?=E3=80=90metax=E3=80=91modify=20cmake?= =?UTF-8?q?=20for=20warpctc=20and=20warprnnt=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel --- backends/metax_gpu/CMakeLists.txt | 4 +- backends/metax_gpu/cmake/warpctc.cmake | 7 +- backends/metax_gpu/cmake/warprnnt.cmake | 8 ++- .../fused_conv2d_add_act_kernel_register.cu | 2 +- .../conv_grad_kernel_register.cu | 42 ++++++++++-- .../kernels/gpudnn/conv_kernel_register.cu | 2 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 2 +- backends/metax_gpu/kernels/impl/warpctc.h | 64 ------------------- .../kernels/impl/warpctc_grad_kernel_impl.h | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 16 ++--- backends/metax_gpu/kernels/impl/warprnnt.h | 63 ------------------ .../kernels/impl/warprnnt_kernel_impl.h | 14 ++-- backends/metax_gpu/kernels/metax_context.cc | 20 +++++- backends/metax_gpu/kernels/metax_context.h | 1 + 14 files changed, 88 insertions(+), 159 deletions(-) rename backends/metax_gpu/kernels/{cuda_kernels => gpudnn}/conv_grad_kernel_register.cu (98%) delete mode 100644 backends/metax_gpu/kernels/impl/warpctc.h delete mode 100644 backends/metax_gpu/kernels/impl/warprnnt.h diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index cca23ab42f5..787aae13e40 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -736,7 +736,7 @@ add_library( target_include_directories( ${TARGET_NAME} PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/kernels - ${CUDA_INCLUDE_DIRS} ${PADDLE_SOURCE_DIR}/third_party/pybind/include + ${CUDA_INCLUDE_DIRS} ${WARPCTC_INCLUDE_DIR} ${WARPRNNT_INCLUDE_DIR} ${PADDLE_SOURCE_DIR}/third_party/pybind/include ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/compat) target_link_libraries( @@ -749,6 +749,8 @@ target_link_libraries( protobuf external_error_proto dgc + ${WARPCTC_LIBRARIES} + ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 71c892a6cfa..9edc92f0a94 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -145,5 +145,8 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its # headers. 
-add_library(warpctc INTERFACE) -add_dependencies(warpctc extern_warpctc) +add_library(warpctc SHARED IMPORTED GLOBAL) +set_target_properties(warpctc PROPERTIES + IMPORTED_LOCATION ${WARPCTC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR} +) \ No newline at end of file diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake index 54a7ad6be86..527f2e55a1b 100644 --- a/backends/metax_gpu/cmake/warprnnt.cmake +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -137,6 +137,8 @@ get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its # headers. -add_library(warprnnt INTERFACE) -# set_property(TARGET warprnnt PROPERTY IMPORTED_LOCATION ${WARPRNNT_LIBRARIES}) -add_dependencies(warprnnt extern_warprnnt) +add_library(warprnnt SHARED IMPORTED GLOBAL) +set_target_properties(warprnnt PROPERTIES + IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR} +) \ No newline at end of file diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu index ee4f105cbc5..48809ceefa4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,7 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, true, groups); + desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu similarity index 98% rename from backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index 885137675b4..e4acb2f95b6 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -161,7 +161,12 @@ void ConvCudnnGradKernelImplV7( args1.idesc.set(*transformed_input_grad, layout_tensor); args1.wdesc.set(*transformed_filter_channel, layout_tensor, iwo_groups); args1.odesc.set(*transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -184,7 +189,12 @@ void ConvCudnnGradKernelImplV7( args2.wdesc.set( *transformed_filter_grad_channel, layout_tensor, iwo_groups); args2.odesc.set(*transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); @@ -1073,7 +1083,12 @@ void ConvCudnnGradGradKernel( args1.idesc.set(transformed_ddX, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + 
args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -1092,7 +1107,12 @@ void ConvCudnnGradGradKernel( args2.idesc.set(transformed_X, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; @@ -1114,7 +1134,12 @@ void ConvCudnnGradGradKernel( args3.idesc.set(transformed_ddX, iwo_group); args3.wdesc.set(*dW, layout, iwo_group); args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; @@ -1136,7 +1161,12 @@ void ConvCudnnGradGradKernel( args4.idesc.set(transformed_dX, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index bdff5fa9f93..bf129fed05c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -81,7 +81,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); #else - args.cdesc.set(dtype, padding_common, strides, dilations, true); + args.cdesc.set(dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index aa1cc80d06d..928201c705f 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -93,7 +93,7 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, args.idesc.set(*transformed_out, iwo_groups); args.wdesc.set(*filter, layout_tensor, iwo_groups); args.odesc.set(*transformed_x, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations_, false, c_groups); + args.cdesc.set(dtype, padding_common, strides, dilations_, phi::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP SearchResult bwd_result; diff --git a/backends/metax_gpu/kernels/impl/warpctc.h b/backends/metax_gpu/kernels/impl/warpctc.h deleted file mode 100644 index ba5da472ade..00000000000 --- a/backends/metax_gpu/kernels/impl/warpctc.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warpctc/include/ctc.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warpctc_dso_flag; -extern void* warpctc_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warpctc routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using warpctcFunc = decltype(&::__name); \ - std::call_once(warpctc_dso_flag, []() { \ - warpctc_dso_handle = phi::dynload::GetWarpCTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warpctc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - DYNAMIC_LOAD_WARPCTC_WRAP(__name) - -#define WARPCTC_ROUTINE_EACH(__macro) \ - __macro(get_warpctc_version); \ - __macro(ctcGetStatusString); \ - __macro(compute_ctc_loss); \ - __macro(compute_ctc_loss_double); \ - __macro(get_workspace_size); \ - __macro(get_workspace_size_double) - -WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); - -#undef DYNAMIC_LOAD_WARPCTC_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h index 51f4ce86890..dc9bc376e63 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warpctc.h" +#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index 9794ba1b3c0..e0b15feca03 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warpctc.h" +#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/tensor_utils.h" @@ -58,7 +58,7 @@ class ComputeCtcLossFunctor { float* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss(activations, + return compute_ctc_loss(activations, gradients, flat_labels, label_lengths, @@ -84,7 +84,7 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss_double( + return compute_ctc_loss_double( activations, gradients, flat_labels, @@ -141,14 +141,14 @@ class WarpCTCFunctor { ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { status = - phi::dynload::get_workspace_size(cpu_label_lengths, + get_workspace_size(cpu_label_lengths, 
cpu_input_lengths, static_cast(sequence_width), static_cast(num_sequences), options_, &workspace_bytes); } else { - status = phi::dynload::get_workspace_size_double( + status = get_workspace_size_double( cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), @@ -162,7 +162,7 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -197,12 +197,12 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); } protected: void init(const Context& dev_ctx, const size_t blank) { - warpctc_version_ = phi::dynload::get_warpctc_version(); + warpctc_version_ = get_warpctc_version(); if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { diff --git a/backends/metax_gpu/kernels/impl/warprnnt.h b/backends/metax_gpu/kernels/impl/warprnnt.h deleted file mode 100644 index 50b0dfc0efc..00000000000 --- a/backends/metax_gpu/kernels/impl/warprnnt.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warprnnt/include/rnnt.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warprnnt_dso_flag; -extern void* warprnnt_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warprnnt routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using warprnntFunc = decltype(&::__name); \ - std::call_once(warprnnt_dso_flag, []() { \ - warprnnt_dso_handle = phi::dynload::GetWarpRNNTDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warprnnt_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - DYNAMIC_LOAD_WARPRNNT_WRAP(__name) - -#define WARPRNNT_ROUTINE_EACH(__macro) \ - __macro(get_warprnnt_version); \ - __macro(rnntGetStatusString); \ - __macro(compute_rnnt_loss); \ - __macro(compute_rnnt_loss_fp64); \ - __macro(get_rnnt_workspace_size); - -WARPRNNT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP); - -#undef DYNAMIC_LOAD_WARPRNNT_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index bb4311f5912..457fdcb9bff 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warprnnt.h" +#include "third_party/warprnnt/include/rnnt.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -55,7 +55,7 @@ class ComputeRnntLossFunctor { float* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss(activations, + return compute_rnnt_loss(activations, gradients, label, label_lengths, @@ -81,7 +81,7 @@ class ComputeRnntLossFunctor { double* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss_fp64(activations, + return compute_rnnt_loss_fp64(activations, gradients, label, label_lengths, @@ -149,7 +149,7 @@ class WarpRNNTFunctor { } size_t workspace_bytes = 0; - status = phi::dynload::get_rnnt_workspace_size( + status = get_rnnt_workspace_size( maxT, maxU, B, gpu, &workspace_bytes, sizeof(T)); PADDLE_ENFORCE_EQ( @@ -158,7 +158,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_rnnt_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -190,7 +190,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); } protected: @@ -200,7 +200,7 @@ class WarpRNNTFunctor { const size_t blank, const float fastemit_lambda, const int num_threads) { - warprnnt_version_ = phi::dynload::get_warprnnt_version(); + warprnnt_version_ = get_warprnnt_version(); options_.maxT = maxT; options_.maxU = maxU; diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_context.cc index 4df4d88b0b4..f0c92f00565 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_context.cc @@ -15,7 +15,25 @@ #include "kernels/metax_context.h" namespace phi { -bool AllowTF32Cudnn() { return false; } +const bool allow_tf32_cublas = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +const bool allow_tf32_cudnn = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +bool AllowTF32Cublas() { return allow_tf32_cublas; } +bool AllowTF32Cudnn() { return 
allow_tf32_cudnn; } + void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_context.h index 5974aadcc41..683a6df7017 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_context.h @@ -128,6 +128,7 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } +bool AllowTF32Cublas(); bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { From ce54693240221505b150900fb601e640181a5620 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Tue, 16 Sep 2025 18:12:37 +0800 Subject: [PATCH 018/121] [metax]modify library to static library (#24) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library --- backends/metax_gpu/cmake/warpctc.cmake | 19 +++++++++---------- backends/metax_gpu/cmake/warprnnt.cmake | 19 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 9edc92f0a94..0733c0f9ce5 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -66,11 +66,11 @@ set(WARPCTC_LIB_DIR if(WIN32) set(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) else() set(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) endif() @@ -93,10 +93,10 @@ if(WIN32) set(WARPCTC_CXX_FLAGS_DEBUG $) else() - set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() @@ -127,7 +127,7 @@ ExternalProject_Add( -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} -DWITH_TORCH=OFF -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON + -DBUILD_SHARED=OFF -DBUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} @@ -145,8 +145,7 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its # headers. 
-add_library(warpctc SHARED IMPORTED GLOBAL) -set_target_properties(warpctc PROPERTIES - IMPORTED_LOCATION ${WARPCTC_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR} -) \ No newline at end of file +add_library(warpctc STATIC IMPORTED GLOBAL) +set_target_properties( + warpctc PROPERTIES IMPORTED_LOCATION ${WARPCTC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR}) diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake index 527f2e55a1b..a8d6683af2b 100644 --- a/backends/metax_gpu/cmake/warprnnt.cmake +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -62,11 +62,11 @@ set(WARPRNNT_LIB_DIR if(WIN32) set(WARPRNNT_LIBRARIES - "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-rnnt Library" FORCE) else() set(WARPRNNT_LIBRARIES - "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-rnnt Library" FORCE) endif() @@ -90,10 +90,10 @@ if(WIN32) set(WARPRNNT_CXX_FLAGS_DEBUG $) else() - set(WARPRNNT_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPRNNT_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(WARPRNNT_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) set(WARPRNNT_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - set(WARPRNNT_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPRNNT_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WARPRNNT_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() @@ -120,7 +120,7 @@ ExternalProject_Add( -DWITH_ROCM=${WITH_ROCM} -DWITH_OMP=${USE_OMP} -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} - -DBUILD_SHARED=ON + -DBUILD_SHARED=OFF -DBUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} @@ -137,8 +137,7 @@ get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its # headers. 
-add_library(warprnnt SHARED IMPORTED GLOBAL) -set_target_properties(warprnnt PROPERTIES - IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR} -) \ No newline at end of file +add_library(warprnnt STATIC IMPORTED GLOBAL) +set_target_properties( + warprnnt PROPERTIES IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR}) From 4cda637ff68d88adfd88c322d4d55c9d7dd15397 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 16 Sep 2025 18:14:09 +0800 Subject: [PATCH 019/121] [Metax] organize documents (#25) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents --- .../calc_reduced_attn_kernel_register.cu | 2 +- backends/metax_gpu/kernels/funcs/softmax.cu | 2 +- .../kernels/funcs/values_vectors_functor.h | 2 +- .../metax_gpu/kernels/gpudnn/conv_cudnn_v7.h | 2 +- .../conv_transpose_grad_kernel_register.cu | 2 +- .../kernels/gpudnn/pool_kernel_register.cu | 2 +- .../metax_gpu/kernels/gpudnn/softmax_gpudnn.h | 2 +- .../kernels/impl/dirichlet_kernel_impl.h | 2 +- .../addmm_grad_kernel_register.cu | 0 .../addmm_kernel_register.cu | 0 .../batch_fc_grad_kernel_register.cu | 0 .../batch_norm_grad_kernel_register.cu | 2 +- .../batch_norm_kernel_register.cu | 0 .../bilinear_grad_kernel_register.cu | 0 .../bilinear_kernel_register.cu | 0 .../metax_kernel/blha_get_max_len_register.cu | 2 +- .../bmm_grad_kernel_register.cu | 0 .../bmm_kernel_register.cu | 0 ...abel_cross_entropy_grad_kernel_register.cu | 0 .../cholesky_grad_kernel_register.cu | 0 .../metax_kernel/cholesky_kernel_register.cu | 2 +- .../conv_kernel_register.cu | 0 .../conv_transpose_kernel_register.cu | 0 .../crop_kernel_register.cu | 0 .../cross_entropy_kernel_register.cu | 2 +- .../depthwise_conv_grad_kernel.cu | 0 .../depthwise_conv_kernel.cu | 0 .../kernels/{ => metax_kernel}/elementwise.h | 0 .../{ => metax_kernel}/flags_declare.cu | 0 .../flash_attn_grad_kernel.cu | 0 .../{ => metax_kernel}/flash_attn_kernel.cu | 0 .../{ => metax_kernel}/flash_attn_kernel.h | 0 .../{ => metax_kernel}/flash_attn_utils.h | 0 .../kernels/{ => metax_kernel}/flashattn.cc | 0 .../kernels/{ => metax_kernel}/flashattn.h | 0 .../flatten2_grad_kernel_register.cu | 0 .../flatten2_kernel_register.cu | 0 .../fused_conv2d_add_act_kernel_register.cu | 3 +- .../fused_rope_grad_kernel_register.cu | 0 .../fused_rope_kernel_register.cu | 0 .../instance_norm_grad_kerne_registerl.cu | 2 +- .../instance_norm_kernel_register.cu | 2 +- .../layer_norm_grad_kernel_register.cu | 0 .../layer_norm_kernel_register.cu | 0 .../lstm_kernel_register.cu | 0 .../metax_kernel/lu_kernel_register.cu | 2 +- .../lu_solve_grad_kernel_register.cu | 0 .../metax_kernel/matrix_rank_tol_kernel.cu | 2 +- .../{ => metax_kernel}/metax_context.cc | 24 +-- .../{ => metax_kernel}/metax_context.h | 6 +- .../multi_dot_grad_kernel_register.cu | 0 .../multi_dot_kernel_register.cu | 0 .../mv_grad_kernel_register.cu | 0 .../mv_kernel_register.cu | 0 .../metax_kernel/qr_kernel_register.cu | 2 +- .../rank_attention_grad_kernel_register.cu | 0 .../rank_attention_kernel_register.cu | 0 .../metax_kernel/rnn_grad_kernel.cu.cc | 2 +- .../kernels/metax_kernel/rnn_kernel.cu.cc | 2 +- .../slogdeterminant_kernel_register.cu | 0 .../softmax_kernel_grad_register.cu | 0 .../softmax_kernel_register.cu | 0 .../solve_grad_kernel_register.cu | 0 .../standard_gamma_kernel_register.cu | 0 .../stft_kernel_register.cu | 0 
.../svd_kernel_register.cu | 0 .../top_k_grad_kernel_register.cu | 0 .../triangular_solve_grad_kernel_register.cu | 0 .../triangular_solve_kernel_register.cu | 0 .../warprnnt_kernel_register.cu | 0 .../weight_only_linear_kernel.cu | 0 .../weight_quantize_kernel_register.cu | 0 backends/metax_gpu/patch/paddle.patch | 204 +++++++++--------- backends/metax_gpu/tests/CMakeLists.txt | 54 ++--- 74 files changed, 166 insertions(+), 163 deletions(-) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_fc_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/cholesky_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_transpose_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/crop_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_grad_kernel.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/elementwise.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flags_declare.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_grad_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_utils.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.cc (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.h (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_conv2d_add_act_kernel_register.cu (99%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lstm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lu_solve_grad_kernel_register.cu (100%) rename 
backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.cc (90%) rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.h (96%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/slogdeterminant_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_grad_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/solve_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/standard_gamma_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/stft_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/svd_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/top_k_grad_kernel_register.cu (100%) mode change 100755 => 100644 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/warprnnt_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_only_linear_kernel.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_quantize_kernel_register.cu (100%) diff --git a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu index 11def2c9ee4..2aa8424f0b1 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/calc_reduced_attn_kernel.h" diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu index d738a53f43a..44bfd02a308 100644 --- a/backends/metax_gpu/kernels/funcs/softmax.cu +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h index ec429950872..8c5996e680b 100644 --- a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h +++ b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h @@ -24,7 +24,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/common/errors.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h index da61a1e5b41..a0f89047045 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h +++ b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "glog/logging.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu index 0067818d165..b7eebfcee2e 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "kernels/gpudnn/conv_cudnn_v7.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/cudnn.h" diff --git a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu index c115f5ad930..1c2bfeedf34 100644 --- a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gpudnn/pool_gpudnn.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h index 168752700e9..5844886ad1b 100644 --- a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h +++ b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h @@ -25,7 +25,7 @@ #include "paddle/phi/kernels/primitive/kernel_primitives.h" // See Note [ Why still include the fluid headers? 
] -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h index 70af87513e5..c2e2e341bf5 100644 --- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h @@ -17,7 +17,7 @@ #include #include -#include "kernels/elementwise.h" +#include "kernels/metax_kernel/elementwise.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu index 062646bbf9d..52fe5a1d566 100644 --- a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/flags.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu index bc9eb23c0e8..42810569fde 100644 --- a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" #include "kernels/metax_kernel/block_attn.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu 
b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu index e8fae2d9da5..8a39ae3f0a8 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu index e94862ec7b0..043a64dc149 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/kernels/cross_entropy_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu diff --git a/backends/metax_gpu/kernels/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h similarity index 100% rename from backends/metax_gpu/kernels/elementwise.h rename to backends/metax_gpu/kernels/metax_kernel/elementwise.h diff --git a/backends/metax_gpu/kernels/flags_declare.cu b/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu similarity index 100% rename from backends/metax_gpu/kernels/flags_declare.cu rename to backends/metax_gpu/kernels/metax_kernel/flags_declare.cu diff --git a/backends/metax_gpu/kernels/flash_attn_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h diff --git a/backends/metax_gpu/kernels/flash_attn_utils.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_utils.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h diff --git a/backends/metax_gpu/kernels/flashattn.cc b/backends/metax_gpu/kernels/metax_kernel/flashattn.cc similarity index 100% rename from backends/metax_gpu/kernels/flashattn.cc rename to backends/metax_gpu/kernels/metax_kernel/flashattn.cc diff --git a/backends/metax_gpu/kernels/flashattn.h b/backends/metax_gpu/kernels/metax_kernel/flashattn.h similarity index 100% rename from backends/metax_gpu/kernels/flashattn.h rename to backends/metax_gpu/kernels/metax_kernel/flashattn.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu similarity index 99% rename from backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu index 48809ceefa4..c0d15b7f1b4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,8 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); + desc->set( + dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu index d7540d949a9..bdf341f5a35 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu index db975d74665..e0c0ae9c1d6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu index 5a2d85418a1..72e4c5b2b79 100644 --- a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu @@ -18,7 +18,7 @@ #include "paddle/phi/backends/dynload/cusolver.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu index bda5dc62f1a..d8c3355e6e4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu @@ -18,7 +18,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/type_traits.h" diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc similarity index 90% rename from backends/metax_gpu/kernels/metax_context.cc rename to backends/metax_gpu/kernels/metax_kernel/metax_context.cc index f0c92f00565..62aaa5fb2de 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -12,27 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" namespace phi { const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; }(); const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; }(); bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h similarity index 96% rename from backends/metax_gpu/kernels/metax_context.h rename to backends/metax_gpu/kernels/metax_kernel/metax_context.h index 683a6df7017..a6610c1dab2 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ -#define BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ +#define BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ #include #include #include @@ -161,4 +161,4 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, return DnnWorkspaceHandle(alloactor, stream); } } // namespace phi -#endif // BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#endif // BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 745069e2eda..c3041254444 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,7 +22,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc index 499832049e4..101b51aa350 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/rnn_grad_kernel.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index f1cf9e09dc7..2598ce093e6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/rnn_kernel.h" #include "glog/logging.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu similarity index 
100% rename from backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu old mode 100755 new mode 100644 similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 0283a443adb..e56826c4f3e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index 
cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,15 +132,15 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h index 1547909d92..66b2779392 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -230,28 +230,28 @@ index 4ff2e528a9..81421c8ca1 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
- + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..81421c8ca1 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..81421c8ca1 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, 
phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 @@ -351,7 +351,7 @@ index 024a7de73e..1e4cdf16be 100644 +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. */ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -361,9 +361,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -379,7 +379,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -392,7 +392,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -400,16 +400,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. 
-DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,19 +514,19 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 @@ -535,7 +535,7 @@ index e30d440ff3..3c74792690 100644 @@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,14 +721,14 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; - asm("bfi.b32 %0, %1, %2, %3, %4;" - : "=r"(ret) - : "r"(to_insert), "r"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (32 - pos - len)) >> (32 - len); return ret; } @@ -738,12 +738,12 @@ index e30d440ff3..3c74792690 100644 int len) { uint64_t ret; - asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); -+ ++ + + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -751,7 +751,7 @@ index e30d440ff3..3c74792690 100644 - asm("bfi.b64 %0, %1, %2, %3, %4;" - : "=l"(ret) - : "l"(to_insert), "l"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); + return ret; @@ -763,7 +763,7 @@ index e30d440ff3..3c74792690 100644 int lane_id; - asm("mov.s32 %0, %%laneid;" : "=r"(lane_id)); - return lane_id; -+ ++ +// // >>>> PTX2CPP Success <<<< +// { +// 
(lane_id)=(threadIdx.x&(warpSize-1)); @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,12 +843,12 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,7 +863,7 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +890,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +901,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +914,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -948,7 +948,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +961,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +993,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,27 +1013,27 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -1048,12 +1048,12 @@ index 5ebbc8d2db..48acf8d0cd 100644 -#include "paddle/phi/kernels/funcs/quant_dequant.h" +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_context.h" - ++#include "kernels/metax_kernel/metax_context.h" + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1067,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1080,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,14 +1118,14 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/third_party/flagcx b/third_party/flagcx index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 37475773026..410ef006514 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -87,32 +87,34 @@ list( list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # 受 test_sum_op.py 影响 - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion - # 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # op_test.py 里 - # self._get_places() - # 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64 - # precision - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # op_test.py 里 - # self._get_places() 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties -) + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace 问题 + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) foreach(test_script ${PYTHON_TEST_SCRIPTS}) From 23fca59cd47c30680a01e9ec79f5d4d16d156320 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 10:44:44 +0800 Subject: [PATCH 020/121] [metax]fix_code style and index_elementwise_put_kernel (#27) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: 
MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/CMakeLists.txt | 15 +++-- ...ex_elementwise_put_grad_kernel_register.cu | 18 ++++- .../index_elementwise_put_kernel_register.cu | 18 ++++- .../kernels/gpudnn/conv_kernel_register.cu | 3 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 7 +- .../kernels/impl/warpctc_grad_kernel_impl.h | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 67 +++++++++---------- .../kernels/impl/warprnnt_kernel_impl.h | 39 +++++------ 8 files changed, 103 insertions(+), 66 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 787aae13e40..f282a9fbf7c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -666,7 +666,6 @@ file( # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu # ############################################################################ - # kernels/fusion kernels/selected_rows ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -713,10 +712,7 @@ file( kernels/cuda_kernels/*.cc kernels/cuda_kernels/*.cu kernels/funcs/blas/*.cc - kernels/ernie_core/*.cu - kernels/ernie_core/rms_norm_kernel_register.cu - kernels/ernie_core/top_p_sampling_kernel_register.cu - kernels/ernie_core/fused_bias_act_kernel_register.cu) + kernels/ernie_core/*.cu) set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) @@ -735,8 +731,13 @@ add_library( target_include_directories( ${TARGET_NAME} - PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/kernels - ${CUDA_INCLUDE_DIRS} ${WARPCTC_INCLUDE_DIR} ${WARPRNNT_INCLUDE_DIR} ${PADDLE_SOURCE_DIR}/third_party/pybind/include + PRIVATE ${PADDLE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/kernels + ${CUDA_INCLUDE_DIRS} + ${WARPCTC_INCLUDE_DIR} + ${WARPRNNT_INCLUDE_DIR} + ${PADDLE_SOURCE_DIR}/third_party/pybind/include ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/compat) target_link_libraries( diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu index c8d69cecae1..f935014d17b 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_grad_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor_grad, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorGradKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu index 391dd908a8d..533204b8102 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index bf129fed05c..0a83b504c76 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -81,7 +81,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); #else - args.cdesc.set(dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); + args.cdesc.set( + dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index 928201c705f..532b7af0db4 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -93,7 +93,12 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, args.idesc.set(*transformed_out, iwo_groups); args.wdesc.set(*filter, layout_tensor, iwo_groups); args.odesc.set(*transformed_x, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations_, phi::AllowTF32Cudnn(), c_groups); + args.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP SearchResult bwd_result; diff --git a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h index dc9bc376e63..16b740d5523 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h +++ 
b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -24,6 +23,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index e0b15feca03..cb39a0171ba 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/tensor_utils.h" @@ -25,6 +24,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { @@ -59,15 +59,15 @@ class ComputeCtcLossFunctor { void* workspace, ctcOptions options) { return compute_ctc_loss(activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -84,17 +84,16 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return compute_ctc_loss_double( - activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return compute_ctc_loss_double(activations, + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -140,21 +139,19 @@ class WarpCTCFunctor { size_t workspace_bytes = 0; ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { - status = - get_workspace_size(cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } else { - status = get_workspace_size_double( - cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size_double(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } PADDLE_ENFORCE_EQ( CTC_STATUS_SUCCESS, diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index 457fdcb9bff..8e3ab6fcdac 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -16,12 +16,12 @@ #include -#include "third_party/warprnnt/include/rnnt.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include 
"paddle/phi/kernels/funcs/math_function.h" +#include "third_party/warprnnt/include/rnnt.h" namespace phi { @@ -56,15 +56,15 @@ class ComputeRnntLossFunctor { void* workspace, rnntOptions options) { return compute_rnnt_loss(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -82,15 +82,15 @@ class ComputeRnntLossFunctor { void* workspace, rnntOptions options) { return compute_rnnt_loss_fp64(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -117,6 +117,7 @@ class WarpRNNTFunctor { * \param blank blank label used in rnnt loss function. * \param cpu_loss loss of each example in CPU memory. */ + void operator()(const Context& dev_ctx, const T* input, T* gradient, From a513aaeb4c895177cd1c6b91d8d3b3c6b8ffe5a6 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:07:44 +0800 Subject: [PATCH 021/121] change_build_917 (#29) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index e3c4304e5f8..2bee14930a3 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -24,14 +24,14 @@ pip uninstall paddlepaddle -y git submodule sync --recursive && git submodule update --init --recursive -export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 -export +# export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 +# export pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -unset http_proxy https_proxy +# unset http_proxy https_proxy # apply patch bash change_patch.sh From 4eb455e0f14f4a74bfd91e3fd44d67500af2a2c0 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:19:49 +0800 Subject: [PATCH 022/121] chang_build (#30) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some 
kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 2bee14930a3..de409153472 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -22,12 +22,16 @@ pip uninstall paddlepaddle -y # init paddle git submodule sync --recursive && git submodule update --init --recursive +sleep 1000000 +unset http_proxy https_proxy # export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 # export pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle + + python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ From 1773978409b36845416e6491a6b5a2e06ff49992 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 17 Sep 2025 13:59:58 
+0800 Subject: [PATCH 023/121] [metax]modify kernel (#31) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel --- backends/metax_gpu/patch/paddle.patch | 257 ++++++++++++++------------ 1 file changed, 138 insertions(+), 119 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index e56826c4f3e..667d9f75d1c 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,15 +132,15 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h -index 1547909d92..66b2779392 100644 +index 1547909d92..ef20838434 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -226,32 +226,32 @@ index c5309e7e11..3328571380 100644 } \ }; \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -index 4ff2e528a9..81421c8ca1 100644 +index 4ff2e528a9..23f7f4b583 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..81421c8ca1 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..81421c8ca1 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex 
CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 @@ -351,7 +351,7 @@ index 024a7de73e..1e4cdf16be 100644 +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. 
*/ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -361,9 +361,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -379,7 +379,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -392,7 +392,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -400,16 +400,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,28 +514,28 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h -index e30d440ff3..3c74792690 100644 +index e30d440ff3..108edda7ca 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -30,11 +30,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,7 +721,7 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; @@ -743,7 +743,7 @@ index e30d440ff3..3c74792690 100644 + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. 
Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,14 +843,27 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { +diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu +index 4c93778bde..c7bdf8a2cc 100644 +--- a/paddle/phi/kernels/gpu/correlation_kernel.cu ++++ b/paddle/phi/kernels/gpu/correlation_kernel.cu +@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, + int stride2, + int corr_type_multiply, + DenseTensor *out) { +- bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; ++ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM; + PADDLE_ENFORCE_EQ( + is_gpu_place, + true, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,9 +876,22 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
+diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu +index c2ddfa1347..c6adf5a6de 100644 +--- a/paddle/phi/kernels/gpu/dgc_kernel.cu ++++ b/paddle/phi/kernels/gpu/dgc_kernel.cu +@@ -188,7 +188,7 @@ void DGCKernel(const Context& dev_ctx, + int buf_size = paddle::communication::dgc::get_buffer_size(k); + phi::Allocator::AllocationPtr tmp_ious_data; + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +- if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + tmp_ious_data = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + buf_size, diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +916,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +927,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +940,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -930,6 +956,19 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" +diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +index 05a977828f..5136608c41 100644 +--- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu ++++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +@@ -58,7 +58,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, + int64_t seed_int = 0; + if (seed.initialized()) { + const auto& seed_place = seed.place().GetType(); +- bool is_gpu_place = seed_place == phi::AllocationType::GPU; ++ bool is_gpu_place = seed_place == phi::AllocationType::GPU || seed_place == phi::AllocationType::CUSTOM; + if (is_gpu_place) { + // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would + // not be CUDAPlace in practice. This case would only happen in Python diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -948,7 +987,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +1000,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +1032,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,31 +1052,31 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -index 5ebbc8d2db..48acf8d0cd 100644 +index 5ebbc8d2db..c7b6c338e2 100644 --- a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -15,8 +15,9 @@ limitations under the License. */ @@ -1049,11 +1088,11 @@ index 5ebbc8d2db..48acf8d0cd 100644 +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" +#include "kernels/metax_kernel/metax_context.h" - + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1106,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1119,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,31 +1157,11 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - -diff --git a/third_party/flagcx b/third_party/flagcx -index 7c469f4af9..7e6c4cc3ca 160000 ---- a/third_party/flagcx -+++ b/third_party/flagcx -@@ -1 +1 @@ --Subproject commit 7c469f4af991bf0f64b8f76d66f8e307a5eaea3f -+Subproject commit 7e6c4cc3cad3fce9b3dedfe46a9d195d616e8ffa -diff --git a/third_party/flashattn b/third_party/flashattn -index 581e48aa69..749aca3807 160000 ---- a/third_party/flashattn -+++ b/third_party/flashattn -@@ -1 +1 @@ --Subproject commit 581e48aa693a17ec3676ec2715d46130310d318d -+Subproject commit 749aca380794b472096d4e7ea01dd252ab0887c9 -diff --git a/third_party/yaml-cpp b/third_party/yaml-cpp ---- a/third_party/yaml-cpp -+++ b/third_party/yaml-cpp -@@ -1 +1 @@ --Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91 -+Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91-dirty + From 69af38186ebfd6029d6e5b1a057d6e8fa389ee08 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:07:26 +0800 Subject: [PATCH 024/121] change_metax_work (#32) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 0d3d2637cdd..c23112f0545 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -19,27 +19,28 @@ defaults: jobs: metax-gpu-test: runs-on: paddle-metax-runner-set + # runs-on: debug-paddle-runner-set steps: - name: Checkout repository run: | git config --global user.name "GitHub Actions" git config --global user.email "actions@github.com" - if [ "${{ github.event_name }}" == "pull_request" ]; then - BRANCH_NAME=${{ github.head_ref }} - else - BRANCH_NAME=${{ github.ref_name }} - fi - git clone \ --reference-if-able /home/runner/PaddleCustomDevice \ --depth=1 \ --shallow-submodules \ --jobs=8 \ - --branch $BRANCH_NAME \ + --branch ${{ github.base_ref }} \ --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
+ if [ "${{ github.event_name }}" == "pull_request" ]; then + git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head + git checkout pull/${{ github.event.pull_request.number }}/head + git submodule update --init --recursive + fi + - name: compile run: | From 7fe6f2dca92c3c0e3fb4c4ceb7f18a26560422e9 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:18:26 +0800 Subject: [PATCH 025/121] change_build (#33) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan 
<1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index de409153472..dbd583c52ea 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -22,8 +22,8 @@ pip uninstall paddlepaddle -y # init paddle git submodule sync --recursive && git submodule update --init --recursive -sleep 1000000 -unset http_proxy https_proxy +# sleep 1000000 +# unset http_proxy https_proxy # export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 From b22fc1317d786931c1aa8784ad30dd72b6dfc2fd Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 17 Sep 2025 17:58:21 +0800 Subject: [PATCH 026/121] [metax] modify fused_bias_dropout_residual_layer_norm (#34) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm --- backends/metax_gpu/patch/paddle.patch | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 667d9f75d1c..b7bdb953077 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -470,6 +470,25 @@ index 88663ec880..98b93072a3 100644 #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" +diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +index 4eae698648..5c047723ea 100644 +--- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h ++++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +@@ -43,11 +43,11 @@ template + using LayerNormParamType = typename CudnnDataType::BatchNormParamType; + + inline static int GetDesiredBlockDim(int64_t block_dim) { +- const int kMaxBlockDim = 512; ++ const int kMaxBlockDim = 256; + #ifdef __HIPCC__ + const int lwarpSize = 64; + #else +- const int lwarpSize = 32; ++ const int lwarpSize = 64; + #endif + return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; + } + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h From c3d1444ef67441b9bb43f9fa5ee7c5a906a7f9df Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:18:30 +0800 Subject: [PATCH 027/121] change_build (#35) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 6 ++++-- backends/metax_gpu/build.sh | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index c23112f0545..74de39c2e13 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -31,14 +31,16 @@ jobs: --depth=1 \ --shallow-submodules \ --jobs=8 \ - --branch ${{ github.base_ref }} \ + --branch ${{ github.base_ref || github.ref_name}} \ + --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
if [ "${{ github.event_name }}" == "pull_request" ]; then git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head - git submodule update --init --recursive + + # git submodule update --init --recursive fi diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index dbd583c52ea..042b779a05c 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -21,7 +21,8 @@ pip uninstall paddlepaddle -y # init paddle -git submodule sync --recursive && git submodule update --init --recursive +# git submodule sync --recursive && git submodule update --init --recursive + # sleep 1000000 # unset http_proxy https_proxy From 569a867b358d9d3707c8d41dbbb0641d03e75de8 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:21:54 +0800 Subject: [PATCH 028/121] change_build (#36) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 74de39c2e13..51c0c62cef6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -32,7 +32,6 @@ jobs: --shallow-submodules \ --jobs=8 \ --branch ${{ github.base_ref || github.ref_name}} \ - --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . From 0edc6f6549fff51d459bf9a77bfbedf4e6a33beb Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:46:15 +0800 Subject: [PATCH 029/121] change_warpctc.cmake (#38) * change_warpctc.cmake --- backends/metax_gpu/cmake/warpctc.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 0733c0f9ce5..ea8e2ade754 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -35,6 +35,13 @@ else() git checkout -- . 
&& git checkout ${WARPCTC_TAG} && patch -Nd ${SOURCE_DIR} < ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) + file(COPY ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh + DESTINATION ${SOURCE_DIR}/include/contrib/moderngpu/include/device/) + message(STATUS "atch file path: ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh") + message( + STATUS + "ModernGPU device path: ${SOURCE_DIR}/include/contrib/moderngpu/include/device/" + ) endif() if(NOT WIN32 AND WITH_GPU) From 2688c8664cc50961267be572ed467ce4b89bc351 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 11:44:44 +0800 Subject: [PATCH 030/121] change_warpctc.cmake (#39) * change warpctc.cmake --- backends/metax_gpu/change_patch.sh | 3 ++- backends/metax_gpu/cmake/warpctc.cmake | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 60d74ec0f3d..f29986a3780 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -21,8 +21,9 @@ unzip mcEigen_3.4.0_paddle_final.zip mv mcEigen_3.4.0_paddle_final eigen3 cd .. cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 +rm -r patch/eigen3 cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - -cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/ +# cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/ diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index ea8e2ade754..5d668032fb1 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -108,6 +108,10 @@ else() set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() +set(COPY_COMMAND + ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh" + "${SOURCE_DIR}/include/contrib/moderngpu/include/device/") + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} @@ -117,6 +121,7 @@ ExternalProject_Add( PATCH_COMMAND COMMAND ${WARPCTC_PATCH_COMMAND} COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${COPY_COMMAND} COMMAND ${WARPCTC_PATHCH_ROCM_COMMAND} # BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} From 6f031fe12a2020044b898b2b2921c899df3d4e3a Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:10:23 +0800 Subject: [PATCH 031/121] test (#40) * test --------- --- backends/metax_gpu/tests/run_test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index 95cce650e6b..92dea2b492b 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -22,6 +22,8 @@ TEST_PATH1="${SCRIPT_DIR}/../../../python" TEST_PATH2="${SCRIPT_DIR}/../../../python/tests" export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}" +export +sleep 1000000 rm -r build mkdir -p build && cd build From e84d399d6056f6dd017031514045a608e717b223 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 14:42:12 +0800 Subject: [PATCH 032/121] test_ut (#41) * change_run_ut --------- --- backends/metax_gpu/tests/run_test.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/tests/run_test.sh 
b/backends/metax_gpu/tests/run_test.sh index 92dea2b492b..7d1e8e072a9 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -23,7 +23,8 @@ TEST_PATH2="${SCRIPT_DIR}/../../../python/tests" export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}" export -sleep 1000000 +# sleep 1000000 + rm -r build mkdir -p build && cd build @@ -34,4 +35,4 @@ cmake .. cmake --build . -ctest -j1 --output-on-failure +ctest -j10 --output-on-failure From b5f2feb398cae8217d1dff39a5e7ef31afa0e02d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:59:28 +0800 Subject: [PATCH 033/121] tets (#43) * remove_tets --------- --- backends/metax_gpu/build.sh | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 042b779a05c..9ca589a7807 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -57,7 +57,7 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON make_maca -j60 echo "install whl" diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 410ef006514..08273782be6 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -81,8 +81,7 @@ list( ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reduce_op.py) + ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) list( REMOVE_ITEM From e20eca7e6f9846583293e988b7484380a25f314f Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:53:51 +0800 Subject: [PATCH 034/121] test (#44) * test --------- --- backends/metax_gpu/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 08273782be6..795a3c5b8ac 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -95,7 +95,7 @@ list( ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口适配问题 + # op_test.py 里 self._get_places()接口的适配问题 ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # device == "gpu" 适配问题 From e37f633a4d440a25126273ccddd7c3ff23288a02 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Fri, 19 Sep 2025 18:30:47 +0800 Subject: [PATCH 035/121] [metax] modify compile (#42) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas --- backends/metax_gpu/CMakeLists.txt | 40 +- backends/metax_gpu/compile.sh | 2 +- .../kernels/funcs/blas/blas_impl.cu.h | 1270 ++++++++--------- .../fused_adam_kernel_register.cu | 0 ...esidual_layer_norm_grad_kernel_register.cu | 0 ...out_residual_layer_norm_kernel_register.cu | 0 ...dding_eltwise_layernorm_kernel_register.cu | 0 
.../fused_layernorm_kernel_register.cu | 0 .../fused_seqpool_cvm_grad_kernel_register.cu | 0 .../fused_seqpool_cvm_kernel_register.cu | 0 ...fused_softmax_mask_grad_kernel_register.cu | 0 .../fused_softmax_mask_kernel_register.cu | 0 ...max_mask_upper_triangle_kernel_register.cu | 0 ...d_stack_transpose_quant_kernel_register.cu | 0 ...sed_swiglu_weighted_bwd_kernel_register.cu | 30 + .../fused_token_prune_kernel_register.cu | 0 ...d_transpose_split_quant_kernel_register.cu | 0 ...nspose_wlch_split_quant_kernel_register.cu | 0 .../kernels/metax_kernel/metax_context.cc | 35 - .../kernels/metax_kernel/metax_context.h | 2 - 20 files changed, 597 insertions(+), 782 deletions(-) mode change 100755 => 100644 backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_adam_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_bias_dropout_residual_layer_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_embedding_eltwise_layernorm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_layernorm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_seqpool_cvm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_seqpool_cvm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_upper_triangle_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_stack_transpose_quant_kernel_register.cu (100%) create mode 100644 backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_token_prune_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_transpose_split_quant_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_transpose_wlch_split_quant_kernel_register.cu (100%) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index f282a9fbf7c..7b8c52f1f31 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -70,7 +70,6 @@ include(eigen) include(xxhash) include(zlib) include(protobuf) -include(generate_pb) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) @@ -614,12 +613,9 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/backends/context_pool.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/repeat_tensor2index_tensor.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/binomial_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -642,29 +638,11 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gather_tree_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_reindex_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_act_dequant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/fp8_fp8_half_gemm.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/variable_length_memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_real_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_complex_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_grad_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps @@ -697,7 +675,6 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/int_bincount_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu) file( @@ -707,6 +684,8 @@ file( passes/*.cc kernels/*.cc kernels/*.cu + kernels/fusion/*.cc + kernels/fusion/*.cu kernels/gpudnn/*.cc kernels/gpudnn/*.cu kernels/cuda_kernels/*.cc @@ -721,13 +700,7 @@ set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA) set(CMAKE_CUCC_COMPILER "cucc") set(CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/") -set_source_files_properties( - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu - PROPERTIES LANGUAGE CUDA) -add_library( - 
${TARGET_NAME} SHARED - ${CUSTOM_DEVICE_SRCS} - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu) +add_library(${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS}) target_include_directories( ${TARGET_NAME} @@ -753,9 +726,6 @@ target_link_libraries( ${WARPCTC_LIBRARIES} ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) target_compile_definitions( diff --git a/backends/metax_gpu/compile.sh b/backends/metax_gpu/compile.sh index e9860ccb7d0..eba45a9ced2 100644 --- a/backends/metax_gpu/compile.sh +++ b/backends/metax_gpu/compile.sh @@ -30,7 +30,7 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON make_maca -j10 diff --git a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h old mode 100755 new mode 100644 index 419387cc9c4..ae4baa52613 --- a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h +++ b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h @@ -34,70 +34,6 @@ PHI_DECLARE_bool(gemm_use_half_precision_compute_type); namespace phi { namespace funcs { - -inline static cublasHandle_t blas_handle_ = nullptr; -inline static cublasHandle_t blas_tensor_core_handle_ = nullptr; -inline static cublasHandle_t blas_tf32_tensor_core_handle_ = nullptr; - -inline std::once_flag flag_sparse_; -inline std::once_flag flag_blas_; -inline std::once_flag flag_blaslt_; -inline std::once_flag flag_dnn_; -inline std::once_flag flag_solver_; -inline std::once_flag flag_cublas_; -inline std::once_flag flag_tensorcore_cublas_; -inline std::once_flag flag_eigen_device_; - -inline std::mutex blas_mtx_; -inline std::mutex blas_tensor_core_mtx_; -inline std::mutex blas_tf32_mtx_; -inline std::mutex sparse_mtx_; -inline std::mutex stream_call_back_mtx_; - -inline void InitBlasHandle(cublasHandle_t *blas_handle, gpuStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(*blas_handle, stream)); -} - -inline void CublasCall(const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - std::lock_guard guard(blas_mtx_); - callback(blas_handle_); -} - -inline bool MetaxTensorCoreAvailable() { - return blas_tensor_core_handle_ != nullptr; -} - -inline void TensorCoreCublasCallIfAvailable( - const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_tensorcore_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - if (blas_tensor_core_handle_ != nullptr) { - std::lock_guard guard(blas_tensor_core_mtx_); - callback(blas_tensor_core_handle_); - } else { - std::lock_guard guard(blas_mtx_); - 
callback(blas_handle_); - } -} - template struct CUBlas; @@ -174,28 +110,26 @@ struct CUBlas { // here. #if CUDA_VERSION >= 8000 VLOG(5) << "use_tensor_op_math: " - << (MetaxTensorCoreAvailable() ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc)); - }, - dev_ctx->stream()); + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasSgemmEx is not supported on cuda <= 7.5")); @@ -376,7 +310,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -386,31 +320,29 @@ struct CUBlas { thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A_ptr.data().get(), - Atype, - lda, - B_ptr.data().get(), - Btype, - ldb, - beta, - C_ptr.data().get(), - Ctype, - ldc, - batchCount, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A_ptr.data().get(), + Atype, + lda, + B_ptr.data().get(), + Btype, + ldb, + beta, + C_ptr.data().get(), + Ctype, + ldc, + batchCount, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmBatchedEx is not supported on cuda <= 7.5")); @@ -486,7 +418,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -494,29 +426,27 @@ struct CUBlas { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -696,7 +626,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -704,29 +634,27 @@ struct CUBlas> { << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1024,7 +952,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1032,29 +960,27 @@ struct CUBlas> { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1186,24 +1112,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N); + }); } #if CUDA_VERSION >= 8000 @@ -1271,24 +1195,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - h_B, - ldb, - h_A, - lda, - &h_beta, - h_C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + h_B, + ldb, + h_A, + lda, + &h_beta, + h_C, + N); + }); #endif // CUDA_VERSION >= 8000 } @@ -1352,24 +1274,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &t_alpha, - B, - static_cast(ldb), - A, - static_cast(lda), - &t_beta, - C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &t_alpha, + B, + static_cast(ldb), + A, + static_cast(lda), + &t_beta, + C, + static_cast(N)); + }); } #if CUDA_VERSION >= 8000 @@ -1447,24 +1367,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, CUBLAS_COMPUTE_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &h_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &h_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1503,7 +1421,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = 
dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1519,30 +1437,27 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - N, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + N, + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -1621,24 +1536,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1713,24 +1626,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1769,7 +1680,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1784,30 +1695,28 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - A, - CUDA_R_16BF, - static_cast(lda), - &h_beta, - C, - CUDA_R_16BF, - static_cast(N), - CUDA_R_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + A, + CUDA_R_16BF, + static_cast(lda), + &h_beta, + C, + CUDA_R_16BF, + 
static_cast(N), + CUDA_R_32F, + algo)); + }); } #else // raise error @@ -1860,24 +1769,22 @@ void Blas::GEMM(bool transA, } else { #endif // CUDA_VERSION >= 8000 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); #if CUDA_VERSION >= 8000 } @@ -1904,24 +1811,22 @@ inline void Blas::GEMM(bool transA, cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); } template <> @@ -1957,36 +1862,33 @@ inline void Blas::GEMM(bool transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - ldc, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -1998,27 +1900,23 @@ inline void Blas::GEMM(bool transA, template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); } template <> template void Blas::SCAL(int n, const T alpha, T *x) const { - CublasCall( - [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } template <> template void Blas::VCOPY(int n, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } template <> @@ -2033,12 +1931,9 @@ void Blas::GEMV(bool trans_a, T *C) const { cublasOperation_t cuTransA = !trans_a ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMV( - handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); } template <> @@ -2112,7 +2007,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2153,60 +2048,56 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - a, - B, - fp, - ldb, - strideB, - A, - fp, - lda, - strideA, - b, - C, - fp, - ldc, - strideC, - batchCount, - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + a, + B, + fp, + ldb, + strideB, + A, + fp, + lda, + strideA, + b, + C, + fp, + ldc, + strideC, + batchCount, + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &beta, - C, - ldc, - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &beta, + C, + ldc, + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2242,7 +2133,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2284,61 +2175,57 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - a, - B, - fp, - static_cast(ldb), - strideB, - A, - fp, - static_cast(lda), - strideA, - b, - C, - fp, - static_cast(ldc), - strideC, - static_cast(batchCount), - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( + handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + a, + B, + fp, + static_cast(ldb), 
+ strideB, + A, + fp, + static_cast(lda), + strideA, + b, + C, + fp, + static_cast(ldc), + strideC, + static_cast(batchCount), + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 T h_alpha = static_cast(alpha); T h_beta = static_cast(beta); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &h_beta, - C, - static_cast(ldc), - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &h_beta, + C, + static_cast(ldc), + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2377,7 +2264,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2392,34 +2279,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2460,7 +2345,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2475,34 +2360,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + 
PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2547,7 +2430,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // (std::is_same::value)) || // std::is_same::value) { // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx_.tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } @@ -2579,7 +2462,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // #endif // } -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2605,12 +2488,11 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // compute_type, // algo)); -// }, -// dev_ctx_.stream()); +// }); // } else { // #endif // CUDA_VERSION >= 9010 -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // CUBlas::GEMM_STRIDED_BATCH(handle, // cuTransB, @@ -2667,7 +2549,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // cublasOperation_t cuTransB = // (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; // const int64_t strideC = M * N; -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasDgemmStridedBatched(handle, @@ -2723,14 +2605,14 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // float h_beta = static_cast(beta); // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx->tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } // VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : // "False"); -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2756,8 +2638,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // CUBLAS_COMPUTE_32F, // algo)); -// }, -// dev_ctx_.stream()); +// }); // #else // // raise error // PADDLE_THROW(phi::errors::Unimplemented( @@ -2812,25 +2693,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2859,25 +2738,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2970,7 +2847,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float f_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2979,31 +2856,29 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B_ptr.data().get(), - CUDA_R_16BF, - ldb, - A_ptr.data().get(), - CUDA_R_16BF, - lda, - &f_beta, - C_ptr.data().get(), - CUDA_R_16BF, - ldc, - batchCount, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B_ptr.data().get(), + CUDA_R_16BF, + ldb, + A_ptr.data().get(), + CUDA_R_16BF, + lda, + &f_beta, + C_ptr.data().get(), + CUDA_R_16BF, + ldc, + batchCount, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -3038,33 +2913,19 @@ void Blas::TRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM( + handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); + }); } template <> template void Blas::BatchedGETRF( int n, T **a, int *ipiv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); } template <> @@ -3084,23 +2945,18 @@ void Blas::BatchedGETRI(int n, "overlap memory space of input matrix (address: %p).", a_inv, a)); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRI_BATCH( - handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); } template <> template void Blas::BatchedMatInv( int n, const T **a, T **a_inv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); } template <> @@ -3118,12 +2974,10 @@ void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, // use CUBLAS_OP_C (conjugate transpose) for complex cublasOperation_t cuTrans = (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRS_BATCH( + handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); + }); } template <> @@ -3152,23 +3006,21 @@ void Blas::BatchedTRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM_BATCH(handle, + cuSide, + cuUplo, + cuTransA, + cuDiag, + N, + M, + &alpha, + A, + lda, + B, + ldb, + batch_size); + }); } } // namespace funcs diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu 
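The BLAS wrapper changes above replace the free functions CublasCall(callback, stream) and TensorCoreCublasCallIfAvailable(callback, stream) with the context members dev_ctx_.CublasCall(callback) and dev_ctx_.TensorCoreCublasCallIfAvailable(callback), so the cuBLAS handle, the stream, and the tensor-core policy all come from the device context instead of being threaded through by the caller. A minimal sketch of the new call pattern follows; the ScaleVector helper and its include list are illustrative assumptions for this note, not code carried by the patch.

#include "paddle/phi/backends/dynload/cublas.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/enforce.h"

// Sketch: scale a device vector in place (x = alpha * x) through the
// context-owned cuBLAS handle. The context binds its own stream to the
// handle before running the callback, so no stream argument is passed.
inline void ScaleVector(const phi::GPUContext& dev_ctx,
                        float alpha,
                        float* x,
                        int n) {
  dev_ctx.CublasCall([&](cublasHandle_t handle) {
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cublasSscal(handle, n, &alpha, x, 1));
  });
}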
diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu new file mode 100644 index 00000000000..08876233bfb --- /dev/null +++ b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_swiglu_weighted_bwd, + metax_gpu, + ALL_LAYOUT, + phi::FusedSwigluWeightedBwdKernel, + float, + double, + int, + int64_t, + phi::bfloat16) { + kernel->OutputAt(0).SetDataType(phi::DataType::BFLOAT16); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::BFLOAT16); +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 62aaa5fb2de..a388387de45 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,25 +15,6 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { -const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } - void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, @@ -87,20 +68,4 @@ static void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { phi::dynload::hipblasLtCreate(blaslt_handle); #endif } - -blasLtHandle_t GetBlasLtHandle() { - std::call_once(flag_blaslt_, [&]() { - if (!blaslt_handle_) { - if (!blaslt_handle_creator_) - InitBlasLtHandle(&blaslt_handle_); - else - blaslt_handle_ = blaslt_handle_creator_(); - } - }); - PADDLE_ENFORCE_NOT_NULL( - blaslt_handle_, - common::errors::InvalidArgument( - "The GPU blasLt handle is nullptr. 
It must not be null.")); - return blaslt_handle_; -} } // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index a6610c1dab2..2339e18a4a6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -128,8 +128,6 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } -bool AllowTF32Cublas(); -bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { if (!cusolver_dn_handle_) { From 1af5148d20ce28e202fb0ac672f266c807d98b17 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:31:14 +0800 Subject: [PATCH 036/121] [Metax] add log analysis script (#46) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script --- .../metax_gpu/tests/scripts/classify.json | 22 ++ .../metax_gpu/tests/scripts/log_analysis.py | 216 ++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 backends/metax_gpu/tests/scripts/classify.json create mode 100644 backends/metax_gpu/tests/scripts/log_analysis.py diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json new file mode 100644 index 00000000000..b97255adc3d --- /dev/null +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -0,0 +1,22 @@ +{ + "OK":{ + "skipped":{ + "rule":["skipped="] + } + }, + + "FAILED":{ + "precision":{ + "rule":["Mismatched elements"] + }, + "api":{ + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + }, + "missing":{ + "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + }, + "file_not_found":{ + "rule":["FileNotFoundError:"] + } + } +} diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py new file mode 100644 index 00000000000..c0716f5b6f5 --- /dev/null +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -0,0 +1,216 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import fnmatch +import shutil +from enum import Enum + + +class TestResult(Enum): + OK = "OK" + FAILURE = "FAILED" + + +class LogAnalyzer: + def __init__( + self, + classify_file: str, + search_path: str, + pattern: str = None, + encoding: str = "utf-8", + ): + self.__patten = pattern + self.__search_path = search_path + self.__encoding = encoding + self.__statistical_data = {} + + self.__classify_data = self.__read_json_file(classify_file) + for key, value in self.__classify_data.items(): + self.__statistical_data[key] = {} + for sub_key in list(value.keys()): + self.__statistical_data[key][sub_key] = [] + + self.__statistical_data[TestResult.OK.value]["noskip"] = [] + self.__statistical_data[TestResult.FAILURE.value]["other"] = [] + + def __read_json_file(self, path: str) -> dict: + with open(path, "r", encoding=self.__encoding) as f: + data = json.load(f) + f.close() + return data + + def __check_path(self, path: str) -> None: + """ + 处理指定路径: + - 若为文件夹路径:不存在则创建,存在则清空内容 + - 若为文件路径:不存在则创建,存在则清空内容 + """ + try: + # 判断路径是否存在 + if os.path.exists(path): + # 路径存在,判断是文件还是文件夹 + if os.path.isfile(path): + # 处理文件:清空内容 + with open(path, "w", encoding="utf-8") as f: + f.write("") # 写入空内容清空文件 + # print(f"文件已存在,已清空内容: {path}") + + elif os.path.isdir(path): + # 处理文件夹:清空所有内容 + for item in os.listdir(path): + item_path = os.path.join(path, item) + if os.path.isfile(item_path) or os.path.islink(item_path): + os.remove(item_path) # 删除文件或链接 + elif os.path.isdir(item_path): + shutil.rmtree(item_path) # 递归删除子文件夹 + # print(f"文件夹已存在,已清空内容: {path}") + else: + # 路径不存在,判断目标类型(根据最后一个元素是否有扩展名) + # 获取路径的最后一部分 + last_part = os.path.basename(path) + + # 判断是否为文件路径(包含扩展名) + if "." in last_part and not last_part.endswith("."): + # 创建文件(包括父目录) + parent_dir = os.path.dirname(path) + if parent_dir and not os.path.exists(parent_dir): + os.makedirs(parent_dir, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + pass # 创建空文件 + # print(f"文件不存在,已创建: {path}") + + else: + # 创建文件夹(支持多级目录) + os.makedirs(path, exist_ok=True) + # print(f"文件夹不存在,已创建: {path}") + + except PermissionError: + print(f"权限错误:无法操作路径 {path}") + except Exception as e: + print(f"处理路径时发生错误: {str(e)}") + + def save_result(self, dir_path: str = "./") -> None: + """ + 判断文件夹是否存在: + - 不存在则创建 + - 存在则清空文件夹内所有内容(保留文件夹本身) + """ + + for key, value in self.__statistical_data.items(): + sub_dir = os.path.join(dir_path, key) + self.__check_path(sub_dir) + + for sub_key, sub_value in value.items(): + # print(f"{sub_key}: {len(value[sub_key])} - ({sub_value})") + try: + with open( + os.path.join(sub_dir, sub_key) + ".txt", "w", encoding="utf-8" + ) as f: + for op_name in sub_value: + if not op_name.endswith("\n"): + op_name += "\n" + f.write(op_name) + # print(f"内容已成功{'追加' if append else '写入'}到 {file_path}") + except Exception as e: + print(f"写入文件失败: {e}") + + def show_result(self) -> None: + test_counts = 0 + for key, value in self.__statistical_data.items(): + print(f"\n---------- {key} ----------") + for sub_key, sub_value in value.items(): + test_counts = test_counts + len(value[sub_key]) + print(f"{sub_key}: {len(value[sub_key])}\n\t{sub_value}\n") + print( + f"\n******************* Total log num: {test_counts} *******************\n\n" + ) + + def run(self): + """ + 读取指定目录下符合命名规则的文件,并遍历每一行 + + 参数: + search_path: 要搜索的根目录 + pattern: 文件名匹配规则(支持通配符,如 '*.txt', 'file_*.log') + """ + for dirpath, dirnames, filenames in os.walk(self.__search_path): + for filename in fnmatch.filter(filenames, self.__patten): + file_path = os.path.join(dirpath, 
filename) + # print(f"\n===== 正在处理文件: {file_path} =====") + + cur_res_type = TestResult.FAILURE + cur_sub_type = "other" + pre_line = None + finish_early = False + + try: + with open(file_path, "r", encoding=self.__encoding) as f: + for line in f: + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for keyword in sub_type_params["rule"]: + if keyword in line: + cur_sub_type = sub_type + if sub_type == "missing": + finish_early = True + break + + if finish_early: + break + + pre_line = line + if finish_early: + break + + if "OK" in pre_line: + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + + op_name = filename.split(".") + if cur_sub_type is None: + self.__statistical_data[cur_res_type.value][ + "noskip" + ].append(op_name[0]) + else: + self.__statistical_data[cur_res_type.value][ + cur_sub_type + ].append(op_name[0]) + # print(f"Result: {cur_res_type.value}, type: {cur_sub_type}") + f.close() + except UnicodeDecodeError: + print(f"警告: 文件 {file_path} 编码不是 utf-8,跳过处理") + except Exception as e: + print(f"处理文件 {file_path} 时出错: {str(e)}") + + +if __name__ == "__main__": + + analyzer = LogAnalyzer( + classify_file="./classify.json", + search_path="./NPU_logs/20250918_065326", + pattern="test_*.log", + ) + + analyzer.run() + analyzer.show_result() + analyzer.save_result("./output") From 518bee8382cdb7879f38e8b81e719aa8853b825e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 19 Sep 2025 19:07:47 +0800 Subject: [PATCH 037/121] add_generate_pb (#47) * add_generate_pb --------- --- backends/metax_gpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 7b8c52f1f31..78b4c9c566b 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -70,6 +70,7 @@ include(eigen) include(xxhash) include(zlib) include(protobuf) +include(generate_pb) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) From bc02549e7450cffb6b6925ef199b6f6fcbd63259 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 22 Sep 2025 16:44:28 +0800 Subject: [PATCH 038/121] modify blas (#51) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas --- backends/metax_gpu/CMakeLists.txt | 1 + .../metax_gpu/kernels/metax_kernel/metax_context.cc | 12 ------------ .../metax_gpu/kernels/metax_kernel/metax_context.h | 4 +--- backends/metax_gpu/patch/paddle.patch | 1 - 4 files changed, 2 insertions(+), 16 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 78b4c9c566b..b98f2bcc919 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -733,6 +733,7 @@ target_compile_definitions( ${TARGET_NAME} PUBLIC PADDLE_WITH_CUDA=1 PADDLE_WITH_CUSTOM_DEVICE=1 + mcblasContext=cublasContext GPUContext=CustomContext KPSContext=CustomContext STREAM_TYPE=cudaStream_t diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 
a388387de45..6d86c81041f 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -56,16 +56,4 @@ void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { allocation_.reset(); allocation_ = allocator_->Allocate(required_workspace_bytes); } - -static std::function blaslt_handle_creator_{nullptr}; -static blasLtHandle_t blaslt_handle_{nullptr}; -static std::once_flag flag_blaslt_; - -static void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - mcblasLtCreate(blaslt_handle); -#elif defined(PADDLE_WITH_HIP) - phi::dynload::hipblasLtCreate(blaslt_handle); -#endif -} } // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 2339e18a4a6..376981f27a4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -27,9 +27,7 @@ #include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" -using blasLtHandle_t = struct mcblasLtContext*; - -blasLtHandle_t GetBlasLtHandle(); +cublasLtHandle_t GetBlasLtHandle(); namespace phi { class DnnWorkspaceHandle { diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index b7bdb953077..beefb730bf7 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -488,7 +488,6 @@ index 4eae698648..5c047723ea 100644 #endif return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; } - diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h From 1977ca87be51518f59506d37c08790938e4c1345 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 22 Sep 2025 17:31:21 +0800 Subject: [PATCH 039/121] [metax] modify tf32 (#52) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context --- .../kernels/metax_kernel/metax_context.cc | 18 ++++++++++++++++++ .../kernels/metax_kernel/metax_context.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 6d86c81041f..efddba5f00b 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,6 +15,24 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { +const bool allow_tf32_cublas = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return true; +}(); + +const bool allow_tf32_cudnn = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +bool AllowTF32Cublas() { return allow_tf32_cublas; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 376981f27a4..2d761439089 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -30,6 +30,8 @@ cublasLtHandle_t GetBlasLtHandle(); namespace phi { +bool AllowTF32Cublas(); +bool AllowTF32Cudnn(); class DnnWorkspaceHandle { public: inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) From 1ae2618ac81e21e41b05797e08f1330eb504c4d5 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:46:50 +0800 Subject: [PATCH 040/121] [Metax] update metax backend CI test (#53) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test --- backends/metax_gpu/tests/CMakeLists.txt | 192 +++++++++++------------- backends/metax_gpu/tests/default.txt | 67 +++++++++ backends/metax_gpu/tests/run_test.sh | 56 ++++++- 3 files changed, 202 insertions(+), 113 deletions(-) create mode 100644 backends/metax_gpu/tests/default.txt diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 795a3c5b8ac..ded54233f24 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -11,117 +11,95 @@ set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") -list( - APPEND - PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py 
- ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) - -list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口的适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +if(NOT TEST_LIST_FILE) + message( + STATUS + " is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used." 
+ ) + file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS) + +else() + if(NOT EXISTS ${TEST_LIST_FILE}) + message(FATAL_ERROR " is not exist, please check it again.") + endif() + + file(STRINGS ${TEST_LIST_FILE} TEST_PROGRAMS) + + if(NOT TEST_PROGRAMS) + message(FATAL_ERROR " is empty.") + endif() + + set(PYTHON_TEST_SCRIPTS "") +endif() + +foreach(test_name ${TEST_PROGRAMS}) + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) + message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") + else() + list(APPEND PYTHON_TEST_SCRIPTS ${CURRENT_TEST_PROGRAM}) + endif() +endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) + +if(NOT TEST_LIST_FILE) + list( + REMOVE_ITEM + PYTHON_TEST_SCRIPTS + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口的适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace 问题 + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +endif() + +if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) + file(MAKE_DIRECTORY ${LOG_OUTPUT_DIR}) + message(WARNING "${LOG_OUTPUT_DIR} is not exist, create it now.") +endif() + foreach(test_script ${PYTHON_TEST_SCRIPTS}) get_filename_component(test_name ${test_script} NAME_WE) - add_test( - NAME "python_${test_name}" - COMMAND ${Python_EXECUTABLE} ${test_script} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + if(LOG_OUTPUT_DIR) + set(test_log_file "${LOG_OUTPUT_DIR}/${test_name}.log") + + add_test( + NAME "python_${test_name}" + COMMAND sh -c + "${Python_EXECUTABLE} ${test_script} > ${test_log_file} 2>&1" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + + else() + add_test( + NAME "python_${test_name}" + COMMAND ${Python_EXECUTABLE} ${test_script} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) endforeach() diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt new file mode 100644 index 00000000000..8e2c3bcdd7e --- /dev/null +++ b/backends/metax_gpu/tests/default.txt @@ -0,0 +1,67 @@ +test_accuracy_op +test_tril_triu_op +test_where_op +test_split_op +test_fill_constant_op +test_empty_op +test_sign_op +test_cast_op +test_index_add_op +test_unbind_op +test_put_along_axis_op +test_layer_norm_op +test_maximum_op +test_accuracy_op +test_strided_slice_op +test_sum_op +test_set_value_op +test_flatten_contiguous_range_op +test_top_k_op +test_subtract_op +test_softmax_op +test_cumsum_op 
+test_greater_equal_op +test_elementwise_div_op +test_top_k_v2_op +test_stack_op +test_one_hot_v2_op +test_fill_any_op +test_gather_op +test_reshape_op +test_index_put_op +test_bitwise_op +test_max_op +test_pad_op +test_elementwise_pow_op +test_uniform_random_op +test_scatter_op +test_cast_op +test_zeros_like_op +test_compare_op +test_shape_op +test_tril_triu_op +test_slice_op +test_elementwise_add_op +test_index_put_op +test_bincount_op +test_assign_op +test_logical_op +test_squared_l2_norm_op +test_mean_op +test_fused_bias_act_op +test_expand_v2_op +test_adamw_op +test_gather_nd_op +test_concat_op +test_scatter_nd_op +test_elementwise_floordiv_op +test_elementwise_mul_op +test_transpose_op +test_einsum_op +test_randint_op +test_c_embedding_op +test_numel_op +test_scale_op +test_softmax_with_cross_entropy_op +test_full_op +test_scatter_op diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index 7d1e8e072a9..b9e8ec5b5cc 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,10 +29,54 @@ export rm -r build mkdir -p build && cd build -cmake .. +TEST_LOG_LEVEL=0 +TEST_LIST_FILE="" +TEST_LOG_OUTPUT_DIR="" +TEST_PARALLEL_NUM=10 -cmake --build . +while getopts "i:o:v:j:h" opt; do + case "$opt" in + i) + TEST_LIST_FILE="$OPTARG" + ;; + o) + TEST_LOG_OUTPUT_DIR="$OPTARG" + echo "Set log output dir [ $TEST_LOG_OUTPUT_DIR ]" + ;; + v) + TEST_LOG_LEVEL=$OPTARG + ;; + j) + TEST_PARALLEL_NUM="$OPTARG" + ;; + h) + echo "用法:$0 -i <测试列表文件> -o <日志输出路径> ..." + echo "选项说明:" + echo " -i 测试程序列表文件" + echo " -o 日志输出路径" + echo " -v GLOG_v 日志等级" + echo " -j ctest 测试并行数量" + echo " -h 显示帮助" + exit 0 + ;; + \?) + echo "error: unknow option '-$OPTARG'." + exit 1 + ;; + :) + echo "error option '-$OPTARG' must have parameter." + exit 1 + ;; + esac +done + + +export GLOG_v=$TEST_LOG_LEVEL -ctest -j10 --output-on-failure +cmake .. -DTEST_LIST_FILE=$TEST_LIST_FILE -DLOG_OUTPUT_DIR=$TEST_LOG_OUTPUT_DIR + +cmake --build . 
+ +ctest -j$TEST_PARALLEL_NUM --output-on-failure From 76d5eb0245904cc209e52dd9fa92dea990db1ad7 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:43:37 +0800 Subject: [PATCH 041/121] [Metax] fix log_analysis.py bug (#54) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug --- .../metax_gpu/tests/scripts/log_analysis.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py index c0716f5b6f5..963d50751f7 100644 --- a/backends/metax_gpu/tests/scripts/log_analysis.py +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -153,7 +153,6 @@ def run(self): cur_res_type = TestResult.FAILURE cur_sub_type = "other" - pre_line = None finish_early = False try: @@ -172,19 +171,19 @@ def run(self): if finish_early: break - pre_line = line if finish_early: break - if "OK" in pre_line: - cur_res_type = TestResult.OK - cur_sub_type = None - for sub_type, sub_type_params in self.__classify_data[ - cur_res_type.value - ].items(): - for rule in sub_type_params["rule"]: - if rule in line: - cur_sub_type = sub_type + if len(line) >= 2 and line[:2] == "OK": + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + break op_name = filename.split(".") if cur_sub_type is None: From 9c17b6e0867119ea51c1c4230603f2a34137ac68 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:09:44 +0800 Subject: [PATCH 042/121] [Metax] update metax CI CMakeLists & scripts (#56) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug * [Metax] update metax CI CMakeLists & scripts --- .github/workflows/metax_work.yaml | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 4 ++- backends/metax_gpu/tests/run_test.sh | 2 +- .../metax_gpu/tests/scripts/classify.json | 31 +++++++++++++++++-- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 51c0c62cef6..aff530d475c 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -51,4 +51,4 @@ jobs: - name: run test run: | cd backends/metax_gpu/tests - bash run_test.sh + bash run_test.sh -j 16 diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index ded54233f24..5b7be15e4f9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -47,6 +47,8 @@ if(NOT TEST_LIST_FILE) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS + # Metax unit test + ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py @@ -101,5 +103,5 @@ foreach(test_script ${PYTHON_TEST_SCRIPTS}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() - set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 600) 
endforeach() diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index b9e8ec5b5cc..7f2277fe4fb 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -33,7 +33,7 @@ mkdir -p build && cd build TEST_LOG_LEVEL=0 TEST_LIST_FILE="" TEST_LOG_OUTPUT_DIR="" -TEST_PARALLEL_NUM=10 +TEST_PARALLEL_NUM=1 while getopts "i:o:v:j:h" opt; do case "$opt" in diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json index b97255adc3d..ca92ad4a0a4 100644 --- a/backends/metax_gpu/tests/scripts/classify.json +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -7,13 +7,38 @@ "FAILED":{ "precision":{ - "rule":["Mismatched elements"] + "rule":["Mismatched elements", + "RuntimeError: Jacobian mismatch for output 0 in y with respect to input 0 in x on Place(metax_gpu:0),", + "AssertionError: np.float64("] }, "api":{ - "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", + "ValueError: The API paddle.device.cuda.get_device_properties", + "TypeError: paddle.index_add api", + "RuntimeError: (Unavailable) Paddle is not compiled with CUDA.", + "ValueError: invalid literal for int() with base", + "AttributeError: module 'paddle.base.libpaddle' has no attribute 'cudnn_version'", + "RuntimeError: Pinning memory is not supported for Place(metax_gpu:0)", + "PreconditionNotMetError: Context place error, excepted GPUPlace, but actually Place(metax_gpu:0).", + "AttributeError: module 'paddle.base.libpaddle.eager.ops.legacy' has no attribute 'fused_gemm_epilogue'", + "ValueError: The device should not be 'gpu', since PaddlePaddle is not compiled with CUDA"] }, "missing":{ - "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + "rule":["missing metax_gpu kernel", + "missing ONEDNN kernel", + "UnimplementedError: There are no kernels which are registered", + "symbol lookup error:", + "RuntimeError: (NotFound) The kernel"] + }, + "core_dumped":{ + "rule":["Segmentation fault"] + }, + "input_dim":{ + "rule":["ValueError: (InvalidArgument) The Input(", + "Test range of input is out of bound"] + }, + "array_dim":{ + "rule":["Arrays are not equal"] }, "file_not_found":{ "rule":["FileNotFoundError:"] From 51c98a20020ba61b2bfab54abf11668a9f40e0b6 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:11:49 +0800 Subject: [PATCH 043/121] [Metax] fix MatmulKernel problem (#57) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug * [Metax] update metax CI CMakeLists & scripts * [Metax] fix MatmulKernel problem * [Metax] update metax CI program --- .../kernels/impl/matmul_kernel_impl.h | 19 +- backends/metax_gpu/tests/CMakeLists.txt | 2 +- backends/metax_gpu/tests/default.txt | 258 ++++++++++++ ...r_equal.py => test_greater_equal_metax.py} | 0 ...ild_src_rank_and_local_expert_id_metax.py} | 0 ...cubate_expand_modality_expert_id_metax.py} | 0 ....py => test_incubate_moe_combine_metax.py} | 0 ...e_dispatch_partial_nosoftmaxtopk_metax.py} | 0 ..._moe_gate_dispatch_w_permute_bwd_metax.py} | 0 ...bate_moe_gate_dispatch_w_permute_metax.py} | 0 ...layer_norm.py => 
test_layer_norm_metax.py} | 0 ...l_op__metax.py => test_matmul_op_metax.py} | 0 ...mpling.py => test_top_p_sampling_metax.py} | 0 .../tests/unittest/test_matmul_op__metax.py | 395 ------------------ 14 files changed, 272 insertions(+), 402 deletions(-) rename backends/metax_gpu/tests/unit_test/{test_greater_equal.py => test_greater_equal_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_build_src_rank_and_local_expert_id.py => test_incubate_build_src_rank_and_local_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_expand_modality_expert_id.py => test_incubate_expand_modality_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_combine.py => test_incubate_moe_combine_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py => test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute_bwd.py => test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute.py => test_incubate_moe_gate_dispatch_w_permute_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_layer_norm.py => test_layer_norm_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_matmul_op__metax.py => test_matmul_op_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_top_p_sampling.py => test_top_p_sampling_metax.py} (100%) delete mode 100644 backends/metax_gpu/tests/unittest/test_matmul_op__metax.py diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h index bf228c81291..5221bd93ba9 100755 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h @@ -40,6 +40,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 #include "paddle/phi/kernels/autotune/auto_tune_base.h" #endif +#include "paddle/phi/kernels/full_kernel.h" // clang-format on namespace phi { @@ -1485,16 +1486,22 @@ void MatmulKernel(const Context& ctx, bool transpose_x, bool transpose_y, DenseTensor* out) { - PADDLE_ENFORCE_NE( + if (x.numel() == 0 || y.numel() == 0) { + // input shape [1, 1, 5, 0], [1, 1, 0, 5], result shape is [1, 1, 5, 5] + phi::Full( + ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); + return; + } + PADDLE_ENFORCE_GE( common::product(x.dims()), 0, - phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE( + common::errors::InvalidArgument( + "The dims of Input(X) should be greater than or equal to 0.")); + PADDLE_ENFORCE_GE( common::product(y.dims()), 0, - phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. 
")); + common::errors::InvalidArgument( + "The dims of Input(Y) should be greater than or equal to 0.")); const std::vector x_dims = common::vectorize(x.dims()); const std::vector y_dims = common::vectorize(y.dims()); MatmulJudgeDtypeKernel( diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 5b7be15e4f9..e8b11d347d9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -48,7 +48,7 @@ if(NOT TEST_LIST_FILE) REMOVE_ITEM PYTHON_TEST_SCRIPTS # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py + ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 8e2c3bcdd7e..9f073d7e92f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -65,3 +65,261 @@ test_scale_op test_softmax_with_cross_entropy_op test_full_op test_scatter_op +test_assign_pos_op +test_index_select_compatible +test_dequantize_abs_max_op +test_fill_any_op +test_fractional_max_pool3d_api +test_nll_loss +test_is_empty_op +test_norm_nn_grad +test_index_fill +test_floor +test_slice_scatter +test_nn_matmul_v2_grad +test_matmul_op_with_head +test_broadcast_shape +test_fill_constant_op +test_decayed_adagrad_op +test_count_nonzero_api +test_tensor_fill_ +test_minimum_op +test_sigmoid_focal_loss +test_dynamic_rnn_stop_gradient +test_ops_roi_align +test_split_op +test_sum_decorator +test_share_data_op +test_assert_op +test_masked_select_op +test_tensor_fill_diagonal_tensor_ +test_unfold_op +test_scatter_add_op +test_flatten_contiguous_range_op +test_empty_like_op +test_logsumexp +test_multiply +test_ceil_op +test_nearest_interp_v2_op +test_incubate_expand_modality_expert_id +test_bmm_op +test_prelu_op +test_batch_fc_op +test_masked_fill +test_overlap_add_op +test_update_loss_scaling_op +test_floor_divide_op +test_increment +test_complex_abs +test_gather_compatible +test_functional_conv2d +test_group_norm_op_v2 +test_conv2d_transpose_op_depthwise_conv +test_diagonal_op +test_maximum_op +test_erfinv_op +test_interp_recompute_scale_factor +test_embedding_scale_grad_by_freq +test_diagonal_scatter +test_higher_dim_scatter +test_infer_shape +test_flip +test_fused_bias_dropout_residual_layer_norm_op +test_greater_equal_op +test_add_op +test_cartesian_prod +test_uniform_random_inplace_op +test_feed_fetch_method +test_pow_op +test_conv3d_transpose_op +test_add_position_encoding_op +test_imperative_data_loader_base +test_rnn_cell_api +test_linspace +test_adaptive_log_softmax_with_loss +test_cross_entropy2_op +test_complex_reshape +test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk +test_gaussian_nll_loss +test_log_normal +test_unstack_op +test_expand_as_v2_op +test_dequantize_log_op +test_complex_sum_layer +test_slice_var +test_scale_op +test_hinge_embedding_loss +test_set_value_op +test_merged_adam_op +test_index_sample_op +test_cuda_empty_cache +test_add_n_op +test_randint_like +test_unique_consecutive_op +test_fill_diagonal_tensor_op +test_log_loss_op +test_linalg_cholesky_inverse +test_numel_op +test_tril_triu_op +test_adaptive_max_pool2d +test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad +test_complex_cast +test_poisson_nll_loss +test_empty_op +test_functional_conv1d_transpose +test_clip_by_norm_op +test_box_clip_op +test_clip_op +test_grad_clip_minimize +test_less_than_op +test_adamw_op +test_data_feeder 
+test_top_p_sampling +test_subtract_op +test_batch_norm_op_v2 +test_cosine_embedding_loss +test_imperative_data_parallel +test_sigmoid +test_adaptive_max_pool3d +test_roll_op +test_index_put_op +test_assign_op +test_amp_check_finite_and_scale_op +test_strided_slice_op +test_label_smooth_functional +test_c_softmax_with_cross_entropy_op +test_sync_batch_norm_op_convert +test_tensor_fill_diagonal_tensor +test_bfloat16_embedding +test_gelu_op +test_full_ +test_concat_op +test_imperative_data_loader_process +test_tensor_fill_diagonal_ +test_clip_grad_norm_ +test_eager_deletion_padding_rnn +test_pool2d_api +test_clip_grad_value_ +test_isfinite_v2_op +test_nn_sigmoid_op +test_adaptive_avg_pool2d +test_size +test_sigmoid_cross_entropy_with_logits_op +test_scatter_reduce_op +test_rsqrt +test_conv2d_transpose_layer +test_scatter_compatible +test_scatter_nd_op +test_add_op_fluid +test_unique +test_compat_split_static +test_stack_op +test_tile_op +test_adam_optimizer_fp32_fp64 +test_batch_norm_op +test_gather_nd_op +test_pow +test_executor_check_fetch_list +test_inplace_softmax_with_cross_entropy +test_cos +test_imperative_parallel_coalesce_split +test_grid_sample_function +test_rnn_decode_api +test_triu_indices_op +test_binary_cross_entropy_with_logits_op +test_mean_op_v1 +test_round_op +test_assign_pos_op_dygraph +test_nn_functional_embedding_static +test_norm_op +test_unbind_op +test_bilinear_interp_v2_op +test_tensor_data_ptr +test_norm_all +test_conv1d_transpose_layer +test_arange +test_compat_unfold +test_fetch_var +test_index_select_op +test_sign_op +test_functional_conv3d_transpose +test_uniform_random_bf16_op +test_gather_tree_op +test_histogram_bin_edges_op +test_fractional_max_pool2d_api +test_fill_any_like_op +test_alpha_dropout +test_conv3d_layer +test_compat_pad +test_box_coder_op +test_full_op +test_repeat_interleave_op +test_reshape_op +test_embedding_renorm +test_log_softmax +test_pad3d_op +test_diag_v2 +test_complex_transpose +test_prior_box_op +test_square_error_cost +test_fused_rotary_position_embedding +test_gru_rnn_op +test_restrict_nonzero +test_dygraph_weight_norm +test_conv_transpose_nn_grad +test_incubate_build_src_rank_and_local_expert_id +test_elementwise_nn_grad +test_fused_bias_dropout_residual_layer_norm_op_api +test_simple_rnn_op +test_data_generator +test_compat_split +test_scatter_add_inplace_op +test_c_softmax_with_multi_label_cross_entropy_op +test_conv3d_transpose_layer +test_less_equal_op +test_gumbel_softmax_op +test_assign_value_op +test_cast_op +test_fused_bias_act_op +test_conv3d_transpose_part2_op +test_log +test_data +test_incubate_moe_combine +test_masked_scatter +test_silu_op +test_select_scatter_op +test_adagrad_op_v2 +test_functional_conv3d +test_bce_with_logits_loss +test_argsort_op +test_layer_norm_op_v2 +test_adaptive_max_pool1d +test_shard_index_op +test_cuda_max_memory_allocated +test_roi_align_op +test_sin +test_take +test_take_along_dim +test_complex_matmul +test_reduce_as_op +test_log_normal_inplace +test_repeat +test_fetch_lod_tensor_array +test_partial_concat_op +test_accuracy_op +test_l1_norm_op +test_bce_loss +test_fused_conv2d_add_act_op +test_tril_indices_op +test_cross_entropy_op +test_blha_get_max_len_op +test_softmax_mask_fuse_op +test_diag_embed +test_one_hot_v2_op +test_selu_op +test_huber_loss_op +test_einsum_op +test_dygraph_spectral_norm +test_block_diag +test_index_elementwise +test_matmul_out diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py 
similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_greater_equal.py rename to backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_layer_norm.py rename to backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py rename to backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_top_p_sampling.py rename to backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py diff --git a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py b/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py 
deleted file mode 100644 index 7545e16d14d..00000000000 --- a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py +++ /dev/null @@ -1,395 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -from tests.op_test import OpTest -import paddle - -paddle.enable_static() -SEED = 2022 - - -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. - if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size,)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size,)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if abs(scale - 1.0) > 1e-09: - Out = Out * scale - return Out - - -class TestBmmOp(OpTest): - """ - case 0 - """ - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (10, 2, 5) - self.y_shape = (10, 5, 8) - - def init_kernel_type(self): - self.dtype = "float32" - - def setUp(self): - self.set_metax_gpu() - self.init_kernel_type() - self.config() - self.op_type = "bmm" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y - result = reference_matmul(x, y) - result = result.astype(self.dtype) - self.inputs = { - "X": x, - "Y": y, - } - self.outputs = {"Out": result} - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp1(TestBmmOp): - """ - case 1 - """ - - def config(self): - self.x_shape = (40, 10, 10) - self.y_shape = (40, 10, 10) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp2(TestBmmOp): - """ - case 2 - """ - - def config(self): - self.x_shape = (4, 10, 80) - self.y_shape = (4, 80, 1) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, - ["X", "Y"], - "Out", - max_relative_error=1e-2, - ) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - -class TestMatMulOp(OpTest): - """ - basic case - """ - - def setUp(self): - self.set_metax_gpu() - self.op_type = "matmul_v2" - self.init_dtype() - self.init_alpha() - self.config() - - X = np.random.random(self.x_shape).astype(self.dtype) - Y = 
np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - X = -0.1 + 0.2 * X - Y = -0.1 + 0.2 * Y - Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) - Out = Out.astype(self.dtype) - self.inputs = {"X": X, "Y": Y} - self.attrs = { - "trans_x": self.transpose_X, - "trans_y": self.transpose_Y, - "alpha": self.alpha, - } - self.outputs = {"Out": Out} - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (100,) - self.y_shape = (100,) - self.transpose_X = False - self.transpose_Y = False - - def init_alpha(self): - self.alpha = 1.0 - - def init_dtype(self): - self.dtype = "float32" - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-7) - - def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestMatMulOp1(TestMatMulOp): - """ - case x_ndim == 1, y_ndim != 1 - """ - - def config(self): - self.x_shape = (100,) - self.y_shape = (1, 3, 2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp2(TestMatMulOp): - """ - case x_ndim != 1, y_ndim == 1 - """ - - def config(self): - self.x_shape = (1, 2, 100, 1) - self.y_shape = (100,) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp3(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp4(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp5(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (100, 2) - self.y_shape = (100, 2) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp6(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 2, 25) - self.y_shape = (25, 4) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp7(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 4, 25) - self.y_shape = (4, 25) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp8(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 25, 4) - self.y_shape = (25, 4) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp9(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 10, 5) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp10(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 10, 5) - self.y_shape = (2, 10, 5) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp11(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 5, 10) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp12(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = 100 - self.y_shape = (1, 2, 2, 100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class 
TestMatMulOp13(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = (2, 1, 100) - self.y_shape = 100 - self.transpose_X = False - self.transpose_Y = False - - -# TODO(metax_gpu): alpha will be supported in next version -# --------------------test matmul alpha-------------------- -# def create_test_alpha_class(parent): -# class TestMatMulOpAlphaCase(parent): -# def init_alpha(self): -# self.alpha = 0.125 - -# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") -# TestMatMulOpAlphaCase.__name__ = cls_name -# globals()[cls_name] = TestMatMulOpAlphaCase - -# create_test_alpha_class(TestMatMulOp) -# create_test_alpha_class(TestMatMulOp1) -# create_test_alpha_class(TestMatMulOp2) -# create_test_alpha_class(TestMatMulOp3) -# create_test_alpha_class(TestMatMulOp4) -# create_test_alpha_class(TestMatMulOp5) -# create_test_alpha_class(TestMatMulOp6) -# create_test_alpha_class(TestMatMulOp9) -# create_test_alpha_class(TestMatMulOp10) -# create_test_alpha_class(TestMatMulOp11) -# create_test_alpha_class(TestMatMulOp12) -# create_test_alpha_class(TestMatMulOp13) - - -# --------------------test matmul fp16-------------------- -def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): - class TestMatMulOpFp16Case(parent): - def init_kernel_type(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output_with_place(self.place, atol=atol) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error - ) - - cls_name = "{0}_{1}".format(parent.__name__, "Fp16") - TestMatMulOpFp16Case.__name__ = cls_name - globals()[cls_name] = TestMatMulOpFp16Case - - -create_test_fp16_class(TestMatMulOp) -create_test_fp16_class(TestMatMulOp1) -create_test_fp16_class(TestMatMulOp2) -create_test_fp16_class(TestMatMulOp3) -create_test_fp16_class(TestMatMulOp4) -create_test_fp16_class(TestMatMulOp5) -create_test_fp16_class(TestMatMulOp6) -create_test_fp16_class(TestMatMulOp9) -create_test_fp16_class(TestMatMulOp10) -create_test_fp16_class(TestMatMulOp11) -create_test_fp16_class(TestMatMulOp12) -create_test_fp16_class(TestMatMulOp13) - -if __name__ == "__main__": - unittest.main() From d113018e9befab1540aa21ee5d6f8261831e245d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:12:06 +0800 Subject: [PATCH 044/121] [metax]fix paddle bug" (#58) * [metax]fix paddle bug --- backends/metax_gpu/CMakeLists.txt | 2 - .../grid_sample_grad_kernel_register.cu | 23 - .../grid_sample_kernel_register.cu | 19 - .../grid_sample_grad_kernel_register.cu | 839 ++++++++++++++++++ .../grid_sample_kernel_register.cu | 527 +++++++++++ .../metax_kernel/weight_only_linear_kernel.cu | 3 +- 6 files changed, 1368 insertions(+), 45 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index b98f2bcc919..bca1ce7aad4 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -310,8 +310,6 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/hinge_loss_grad_kernel.cu 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/hinge_loss_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gru_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/grid_sample_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/generate_proposals_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gammaln_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu deleted file mode 100644 index 83c47dc86db..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/grid_sample_grad_kernel.h" - -PD_CUSTOM_KERNEL_REGISTER(grid_sample_grad, - metax_gpu, - ALL_LAYOUT, - phi::GridSampleGradKernel, - float, - double) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu deleted file mode 100644 index a0447405971..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/grid_sample_kernel.h" - -PD_CUSTOM_KERNEL_REGISTER( - grid_sample, metax_gpu, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu new file mode 100644 index 00000000000..8aae95bdb22 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu @@ -0,0 +1,839 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernels/metax_kernel/metax_context.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +namespace phi { + +template +static __forceinline__ __device__ void AtomicAdd(T* data, + IndexT h, + IndexT w, + IndexT sH, + IndexT sW, + IndexT H, + IndexT W, + T delta) { + if (InBounds(h, w, H, W)) { + phi::CudaAtomicAdd(data + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ void AtomicAdd3D(T* data, + IndexT d, + IndexT h, + IndexT w, + IndexT sD, + IndexT sH, + IndexT sW, + IndexT D, + IndexT H, + IndexT W, + T delta) { + if (InBounds3D(d, h, w, D, H, W)) { + phi::CudaAtomicAdd(data + d * sD + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ T +UnnormalizeWithMask(T coord, IndexT size, bool align_corners, T* grad_in) { + if (align_corners) { + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexesWithMask(T in, + IndexT clip_limit, + T* grad_in) { + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + T max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +template +static __forceinline__ __device__ T +ReflectIndexesWithMask(T in, IndexT twice_low, IndexT twice_high, T* grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + IndexT grad_in_mult_; + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + T extra = fmod(in, span); + IndexT flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T +ComputePositionsWithMask(T coord, + IndexT size, + PaddingMode padding_mode, + bool align_corners, + T* grad_in) { + T grad_clip, grad_refl; + coord = UnnormalizeWithMask(coord, size, align_corners, grad_in); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == PaddingMode::reflect) { + coord = align_corners ? 
ReflectIndexesWithMask( + coord, 0, 2 * (size - 1), &grad_refl) + : ReflectIndexesWithMask( + coord, -1, 2 * size - 1, &grad_refl); + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + return SafeDownGradeToIntRange(coord); +} + +template +__global__ void GridSamplerCudaBackwardKernel(const IndexT nthreads, + const T* grad_output, + const T* input, + const T* grid, + IndexT n, + IndexT out_c, + IndexT out_h, + IndexT out_w, + IndexT in_h, + IndexT in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sN = out_c * in_h * in_w; + IndexT inp_sC = in_h * in_w; + IndexT inp_sH = in_w; + IndexT inp_sW = 1; + IndexT grid_sN = out_h * out_w * 2; + IndexT grid_sH = out_w * 2; + IndexT grid_sW = 2; + IndexT grid_sCoor = 1; + + IndexT gOut_sN = out_c * out_h * out_w; + IndexT gOut_sC = out_h * out_w; + IndexT gOut_sH = out_w; + IndexT gOut_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT n = index / (out_h * out_w); + const IndexT grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + T gix_mult, giy_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + + if (mode == Mode::bilinear) { + IndexT ix_nw = static_cast(floor(ix)); + IndexT iy_nw = static_cast(floor(iy)); + IndexT ix_ne = ix_nw + 1; + IndexT iy_ne = iy_nw; + IndexT ix_sw = ix_nw; + IndexT iy_sw = iy_nw + 1; + IndexT ix_se = ix_nw + 1; + IndexT iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + T gix = static_cast(0), giy = static_cast(0); + IndexT gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + IndexT inp_offset_NC = n * inp_sN; + for (IndexT c = 0; c < out_c; ++c, + inp_offset_NC += inp_sC, + gInp_ptr_NC += inp_sC, + gOut_offset += gOut_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd( + gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut); + AtomicAdd( + gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut); + + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = gix_mult * gix; + gGrid_ptr_NHW[1] = giy_mult * giy; + } + } else if 
(mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::nearbyint(ix)); + IndexT iy_nearest = static_cast(std::nearbyint(iy)); + + IndexT gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; + ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + AtomicAdd(gInp_ptr_NC, + iy_nearest, + ix_nearest, + inp_sH, + inp_sW, + in_h, + in_w, + grad_output[gOut_offset]); + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = static_cast(0); + gGrid_ptr_NHW[1] = static_cast(0); + } + } + } +} + +template +__global__ void GridSampler3DCudaBackwardKernel(const IndexT nthreads, + const T* grad_output, + const T* input, + const T* grid, + IndexT out_c, + IndexT out_d, + IndexT out_h, + IndexT out_w, + IndexT in_d, + IndexT in_h, + IndexT in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sW = 1; + IndexT inp_sH = in_w; + IndexT inp_sD = in_h * in_w; + IndexT inp_sC = in_d * inp_sD; + IndexT inp_sN = out_c * inp_sC; + + IndexT grid_sCoor = 1; + IndexT grid_sW = 3; + IndexT grid_sH = out_w * grid_sW; + IndexT grid_sD = out_h * grid_sH; + IndexT grid_sN = out_d * grid_sD; + + IndexT gOut_sW = 1; + IndexT gOut_sH = out_w; + IndexT gOut_sD = out_h * out_w; + IndexT gOut_sC = out_d * gOut_sD; + IndexT gOut_sN = out_c * gOut_sC; + + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT d = (index / (out_h * out_w)) % out_d; + const IndexT n = index / (out_d * out_h * out_w); + const auto grid_offset = + n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z coordinates from grid + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + T iz = grid[grid_offset + 2 * grid_sCoor]; + + // multipliers for gradients on ix, iy, and iz + T gix_mult, giy_mult, giz_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + iz = ComputePositionsWithMask( + iz, in_d, padding_mode, align_corners, &giz_mult); + + if (mode == Mode::bilinear) { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + IndexT ix_tnw = static_cast(std::floor(ix)); + IndexT iy_tnw = static_cast(std::floor(iy)); + IndexT iz_tnw = static_cast(std::floor(iz)); + + IndexT ix_tne = ix_tnw + 1; + IndexT iy_tne = iy_tnw; + IndexT iz_tne = iz_tnw; + + IndexT ix_tsw = ix_tnw; + IndexT iy_tsw = iy_tnw + 1; + IndexT iz_tsw = iz_tnw; + + IndexT ix_tse = ix_tnw + 1; + IndexT iy_tse = iy_tnw + 1; + IndexT iz_tse = iz_tnw; + + IndexT ix_bnw = ix_tnw; + IndexT iy_bnw = iy_tnw; + IndexT iz_bnw = iz_tnw + 1; + + IndexT ix_bne = ix_tnw + 1; + IndexT iy_bne = iy_tnw; + IndexT iz_bne = iz_tnw + 1; + + IndexT ix_bsw = ix_tnw; + IndexT iy_bsw = iy_tnw + 1; + IndexT iz_bsw = iz_tnw + 1; + + IndexT ix_bse = ix_tnw + 1; + IndexT iy_bse = iy_tnw + 1; + IndexT iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + T bne = (ix - ix_tsw) * (iy_tsw - iy) * 
(iz - iz_tsw); + T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + T gix = static_cast(0), giy = static_cast(0), + giz = static_cast(0); + IndexT gOut_offset = + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + IndexT inp_offset_NC = n * inp_sN; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; ++c, + gOut_offset += gOut_sC, + gInp_ptr_NC += inp_sC, + inp_offset_NC += inp_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd3D(gInp_ptr_NC, + iz_tnw, + iy_tnw, + ix_tnw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tnw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tne, + iy_tne, + ix_tne, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tne * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tsw, + iy_tsw, + ix_tsw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tsw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tse, + iy_tse, + ix_tse, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tse * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bnw, + iy_bnw, + ix_bnw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bnw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bne, + iy_bne, + ix_bne, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bne * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bsw, + iy_bsw, + ix_bsw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bsw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bse, + iy_bse, + ix_bse, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bse * gOut); + + // calculate grad_grid + if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) { + T tnw_val = input[inp_offset_NC + iz_tnw * inp_sD + iy_tnw * inp_sH + + ix_tnw * inp_sW]; + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; + } + if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) { + T tne_val = input[inp_offset_NC + iz_tne * inp_sD + iy_tne * inp_sH + + ix_tne * inp_sW]; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; + } + if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) { + T tsw_val = input[inp_offset_NC + iz_tsw * inp_sD + iy_tsw * inp_sH + + ix_tsw * inp_sW]; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; + } + if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) { + T tse_val = input[inp_offset_NC + iz_tse * inp_sD + iy_tse * inp_sH + + ix_tse * inp_sW]; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; + } + if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) { + T bnw_val = input[inp_offset_NC + iz_bnw * inp_sD + iy_bnw * inp_sH + + ix_bnw * inp_sW]; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; + } + if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) { + T bne_val = input[inp_offset_NC + iz_bne * inp_sD + iy_bne * inp_sH + + ix_bne * inp_sW]; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; + } + if (InBounds3D(iz_bsw, 
iy_bsw, ix_bsw, in_d, in_h, in_w)) { + T bsw_val = input[inp_offset_NC + iz_bsw * inp_sD + iy_bsw * inp_sH + + ix_bsw * inp_sW]; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; + } + if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) { + T bse_val = input[inp_offset_NC + iz_bse * inp_sD + iy_bse * inp_sH + + ix_bse * inp_sW]; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; + } + } + if (grad_grid != nullptr) { + T* gGrid_ptr_NDHW = grad_grid + index * grid_sW; + gGrid_ptr_NDHW[0] = gix_mult * gix; + gGrid_ptr_NDHW[1] = giy_mult * giy; + gGrid_ptr_NDHW[2] = giz_mult * giz; + } + } else if (mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::round(ix)); + IndexT iy_nearest = static_cast(std::round(iy)); + IndexT iz_nearest = static_cast(std::round(iz)); + + // assign nearest neighbor pixel value to output pixel + IndexT gOut_offset = + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; + ++c, gOut_offset += gOut_sC, gInp_ptr_NC += inp_sC) { + AtomicAdd3D(gInp_ptr_NC, + iz_nearest, + iy_nearest, + ix_nearest, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + grad_output[gOut_offset]); + } + if (grad_grid != nullptr) { + T* gGrid_ptr_NDHW = grad_grid + index * grid_sW; + gGrid_ptr_NDHW[0] = static_cast(0); + gGrid_ptr_NDHW[1] = static_cast(0); + gGrid_ptr_NDHW[2] = static_cast(0); + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + if (out_grad.numel() == 0) { + if (x_grad) { + phi::Full( + dev_ctx, phi::IntArray(common::vectorize(x_grad->dims())), 0, x_grad); + } + if (grid_grad) { + phi::Full(dev_ctx, + phi::IntArray(common::vectorize(grid_grad->dims())), + 0, + grid_grad); + } + return; + } + + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + +#ifndef PADDLE_WITH_HIP + if (condCudnnGridSampler(x, grid) && + enum_padding_mode == PaddingMode::zeros && enum_mode == Mode::bilinear && + align_corners) { + const int64_t N = x.dims()[0]; + const int64_t C = x.dims()[1]; + const int64_t H_in = x.dims()[2]; + const int64_t W_in = x.dims()[3]; + const int64_t H_out = grid.dims()[1]; + const int64_t W_out = grid.dims()[2]; + + // cuDNN handle + cudnnHandle_t handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + // Create and set Tensor descriptors (NCHW) for x/y + cudnnTensorDescriptor_t x_desc, dx_desc, y_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&dx_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&y_desc)); + + const cudnnDataType_t cudnn_dtype = + std::is_same::value ? 
CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(x_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + // The shape of dx is consistent with that of x + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(dx_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + // The shape of y is consistent with out_grad + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(y_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out))); + + // Spatial Transformer descriptor: specifies sampler type and output + // dimension (N, C, H_out, W_out) + cudnnSpatialTransformerDescriptor_t st_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateSpatialTransformerDescriptor(&st_desc)); + int st_dims[4] = {static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out)}; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetSpatialTransformerNdDescriptor( + st_desc, CUDNN_SAMPLER_BILINEAR, cudnn_dtype, 4, st_dims)); + + // data pointer + const T* x_data = x.data(); + const T* grid_data = grid.data(); + const T* dy_data = out_grad.data(); + + T* dx_data = dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* dgrid_data = nullptr; + if (grid_grad) { + dgrid_data = dev_ctx.template Alloc(grid_grad); + } + + // alpha/beta + using AlphaBetaT = typename std:: + conditional::value, float, double>::type; + const AlphaBetaT one = static_cast(1.0); + const AlphaBetaT zero = static_cast(0.0); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfSamplerBackward( + handle, + st_desc, + static_cast(&one), // alpha (for dx) + x_desc, + static_cast(x_data), + static_cast(&zero), // beta (for dx) + dx_desc, + static_cast(dx_data), + static_cast(&one), // alpha (for dgrid) + y_desc, + static_cast(dy_data), + static_cast(grid_data), + static_cast(&zero), // beta (for dgrid) + static_cast(dgrid_data))); + + // resource release + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroySpatialTransformerDescriptor(st_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(dx_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(y_desc)); + return; + } +#endif + + bool use_int32_index = x.numel() <= std::numeric_limits::max() && + grid.numel() <= std::numeric_limits::max() && + out_grad.numel() <= std::numeric_limits::max(); + + if (x.dims().size() == 4) { + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int64_t count = n * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSamplerCudaBackwardKernel \ + <<>>( \ + count, \ + out_grad.data(), \ + x.data(), \ + grid.data(), \ + n, \ + c, \ + 
out_h, \ + out_w, \ + in_h, \ + in_w, \ + x_grad->data(), \ + grid_grad_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners); + if (use_int32_index) { + LAUNCH_KERNEL(int32_t) + } else { + LAUNCH_KERNEL(int64_t) + } +#undef LAUNCH_KERNEL + } else { + const int64_t out_d = grid.dims()[1]; + const int64_t out_h = grid.dims()[2]; + const int64_t out_w = grid.dims()[3]; + const int64_t n = x.dims()[0]; + const int64_t c = x.dims()[1]; + const int64_t in_d = x.dims()[2]; + const int64_t in_h = x.dims()[3]; + const int64_t in_w = x.dims()[4]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int64_t count = static_cast(n * out_d * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSampler3DCudaBackwardKernel \ + <<>>( \ + count, \ + out_grad.data(), \ + x.data(), \ + grid.data(), \ + c, \ + out_d, \ + out_h, \ + out_w, \ + in_d, \ + in_h, \ + in_w, \ + x_grad->data(), \ + grid_grad_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners); + if (use_int32_index) { + LAUNCH_KERNEL(int32_t) + } else { + LAUNCH_KERNEL(int64_t) + } +#undef LAUNCH_KERNEL + } +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL(grid_sample_grad, + metax_gpu, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu new file mode 100644 index 00000000000..71050c264c6 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu @@ -0,0 +1,527 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "kernels/metax_kernel/metax_context.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" +#include "paddle/phi/kernels/grid_sample_kernel.h" + +namespace phi { + +template +static __forceinline__ __device__ T Unnormalize(T coord, + IndexT size, + bool align_corners) { + return align_corners ? 
((coord + 1.f) / 2) * (size - 1) + : ((coord + 1.f) * size - 1) / 2; +} + +template +static __forceinline__ __device__ T ClipIndexes(T in, IndexT max_value) { + return min(static_cast(max_value - 1), max(in, static_cast(0))); +} + +template +static __forceinline__ __device__ T ReflectIndexes(T in, + IndexT twice_low, + IndexT twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = fabs(in - min); + T extra = fmod(in, span); + IndexT flips = floor(in / span); + return (flips & 1) ? span - extra + min : extra + min; // cond ? odd : even +} + +template +static __forceinline__ __device__ T ComputePositions(T coord, + IndexT size, + PaddingMode padding_mode, + bool align_corners) { + coord = Unnormalize(coord, size, align_corners); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexes(coord, size); + } else if (padding_mode == PaddingMode::reflect) { + coord = align_corners ? ReflectIndexes(coord, 0, 2 * (size - 1)) + : ReflectIndexes(coord, -1, 2 * size - 1); + coord = ClipIndexes(coord, size); + } + return SafeDownGradeToIntRange(coord); +} + +template +__global__ void GridSampleCudaKernel(IndexT n, + IndexT out_c, + IndexT out_hw, + IndexT in_h, + IndexT in_w, + const T* __restrict__ input, + const T* __restrict__ grid, + T* __restrict__ output, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT nthreads = n * out_hw; + IndexT inp_sN = out_c * (in_h * in_w); + IndexT inp_sC = in_h * in_w; + IndexT inp_sH = in_w; + IndexT inp_sW = 1; + IndexT grid_sNHW = 2; + IndexT grid_sCoor = 1; + IndexT out_sN = out_c * out_hw; + IndexT out_sC = out_hw; + IndexT out_sHW = 1; + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT hw = index % out_hw; + const IndexT n = index / out_hw; + const IndexT grid_offset = index * grid_sNHW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + if (mode == Mode::bilinear) { + IndexT ix_nw = floor(ix); + IndexT iy_nw = floor(iy); + IndexT ix_ne = ix_nw + 1; + IndexT iy_ne = iy_nw; + IndexT ix_sw = ix_nw; + IndexT iy_sw = iy_nw + 1; + IndexT ix_se = ix_nw + 1; + IndexT iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + IndexT inp_offset_NC = n * inp_sN; + T* out_ptr_NCHW = output + (n * out_sN + hw * out_sHW); + + for (IndexT c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + T value{0}; + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + value += input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + value += input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + value += input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + value += input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; + } + *out_ptr_NCHW = value; + } + } else if (mode == Mode::nearest) { + IndexT ix_nearest = std::nearbyint(ix); + IndexT iy_nearest = std::nearbyint(iy); + IndexT inp_offset_NC = n * inp_sN; + T* out_ptr_NCHW = output + (n * out_sN + hw * out_sHW); + for (IndexT c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + if 
(InBounds(iy_nearest, ix_nearest, in_h, in_w)) { + *out_ptr_NCHW = + input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } else { + *out_ptr_NCHW = static_cast(0); + } + } + } + } +} + +template +__global__ void GridSample3DCudaKernel(const IndexT nthreads, + IndexT out_c, + IndexT out_d, + IndexT out_h, + IndexT out_w, + IndexT in_d, + IndexT in_h, + IndexT in_w, + const T* input, + const T* grid, + T* output, + const Mode interpolation_mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sW = 1; + IndexT inp_sH = in_w; + IndexT inp_sD = in_h * in_w; + IndexT inp_sC = in_d * inp_sD; + IndexT inp_sN = out_c * inp_sC; + + IndexT grid_sCoor = 1; + IndexT grid_sW = 3; + IndexT grid_sH = out_w * grid_sW; + IndexT grid_sD = out_h * grid_sH; + IndexT grid_sN = out_d * grid_sD; + + IndexT out_sW = 1; + IndexT out_sH = out_w; + IndexT out_sD = out_h * out_w; + IndexT out_sC = out_d * out_sD; + IndexT out_sN = out_c * out_sC; + + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT d = (index / (out_h * out_w)) % out_d; + const IndexT n = index / (out_d * out_h * out_w); + const IndexT grid_offset = + n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + // get the corresponding input x, y, z coordinates from grid + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + T iz = grid[grid_offset + 2 * grid_sCoor]; + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + iz = ComputePositions(iz, in_d, padding_mode, align_corners); + if (interpolation_mode == Mode::bilinear) { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + IndexT ix_tnw = static_cast(std::floor(ix)); + IndexT iy_tnw = static_cast(std::floor(iy)); + IndexT iz_tnw = static_cast(std::floor(iz)); + + IndexT ix_tne = ix_tnw + 1; + IndexT iy_tne = iy_tnw; + IndexT iz_tne = iz_tnw; + + IndexT ix_tsw = ix_tnw; + IndexT iy_tsw = iy_tnw + 1; + IndexT iz_tsw = iz_tnw; + + IndexT ix_tse = ix_tnw + 1; + IndexT iy_tse = iy_tnw + 1; + IndexT iz_tse = iz_tnw; + + IndexT ix_bnw = ix_tnw; + IndexT iy_bnw = iy_tnw; + IndexT iz_bnw = iz_tnw + 1; + + IndexT ix_bne = ix_tnw + 1; + IndexT iy_bne = iy_tnw; + IndexT iz_bne = iz_tnw + 1; + + IndexT ix_bsw = ix_tnw; + IndexT iy_bsw = iy_tnw + 1; + IndexT iz_bsw = iz_tnw + 1; + + IndexT ix_bse = ix_tnw + 1; + IndexT iy_bse = iy_tnw + 1; + IndexT iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + const T* inp_ptr_NC = input + n * inp_sN; + T* out_ptr_NCDHW = + output + (n * out_sN + d * out_sD + h * out_sH + w * out_sW); + for (IndexT c = 0; c < out_c; + ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + *out_ptr_NCDHW = static_cast(0); + if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * + tnw; + } + if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) { + 
*out_ptr_NCDHW += + inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * + tne; + } + if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * + tsw; + } + if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * + tse; + } + if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * + bnw; + } + if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * + bne; + } + if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * + bsw; + } + if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * + bse; + } + } + } else if (interpolation_mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::nearbyint(ix)); + IndexT iy_nearest = static_cast(std::nearbyint(iy)); + IndexT iz_nearest = static_cast(std::nearbyint(iz)); + + // assign nearest neighbor pixel value to output pixel + const T* inp_ptr_NC = input + n * inp_sN; + T* out_ptr_NCDHW = + output + (n * out_sN + d * out_sD + h * out_sH + w * out_sW); + for (IndexT c = 0; c < out_c; + ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + if (InBounds3D(iz_nearest, iy_nearest, ix_nearest, in_d, in_h, in_w)) { + *out_ptr_NCDHW = + inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + + ix_nearest * inp_sW]; + } else { + *out_ptr_NCDHW = static_cast(0); + } + } + } + } +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + if (out && out->numel() == 0) { + dev_ctx.template Alloc(out); + return; + } + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + +#ifndef PADDLE_WITH_HIP + if (condCudnnGridSampler(x, grid) && + enum_padding_mode == PaddingMode::zeros && enum_mode == Mode::bilinear && + align_corners) { + const int64_t N = x.dims()[0]; + const int64_t C = x.dims()[1]; + const int64_t H_in = x.dims()[2]; + const int64_t W_in = x.dims()[3]; + const int64_t H_out = grid.dims()[1]; + const int64_t W_out = grid.dims()[2]; + + out->Resize({N, C, H_out, W_out}); + auto* out_data = dev_ctx.template Alloc(out); + + cudnnHandle_t handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + // Create and set Tensor descriptors (NCHW) for x and out + cudnnTensorDescriptor_t x_desc, y_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&y_desc)); + + const cudnnDataType_t cudnn_dtype = + std::is_same::value ? 
CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(x_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(y_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out))); + + // Spatial Transformer descriptor: specifies sampler type and output + // dimension (N, C, H_out, W_out) + cudnnSpatialTransformerDescriptor_t st_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateSpatialTransformerDescriptor(&st_desc)); + int st_dims[4] = {static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out)}; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetSpatialTransformerNdDescriptor( + st_desc, CUDNN_SAMPLER_BILINEAR, cudnn_dtype, 4, st_dims)); + + const T* x_data = x.data(); + const T* grid_data = grid.data(); + using AlphaBetaT = typename std:: + conditional::value, float, double>::type; + const AlphaBetaT alpha = static_cast(1.0); + const AlphaBetaT beta = static_cast(0.0); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfSamplerForward( + handle, + st_desc, + static_cast(&alpha), + x_desc, + static_cast(x_data), + static_cast(grid_data), + static_cast(&beta), + y_desc, + static_cast(out_data))); + + // resource release + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroySpatialTransformerDescriptor(st_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(y_desc)); + return; + } +#endif + + bool use_int32_index = x.numel() <= std::numeric_limits::max() && + grid.numel() <= std::numeric_limits::max() && + out->numel() <= std::numeric_limits::max(); + + if (x.dims().size() == 4) { + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h + << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3]; + + int64_t count = n * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSampleCudaKernel \ + <<>>( \ + n, \ + c, \ + out_h * out_w, \ + in_h, \ + in_w, \ + x.data(), \ + grid.data(), \ + output_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners) + if (use_int32_index) { + LAUNCH_KERNEL(int); + } else { + LAUNCH_KERNEL(int64_t); + } +#undef LAUNCH_KERNEL + } else { + const int64_t n = grid.dims()[0]; + const int64_t out_d = grid.dims()[1]; + const int64_t out_h = grid.dims()[2]; + const int64_t out_w = grid.dims()[3]; + const int64_t c = x.dims()[1]; + const int64_t in_d = x.dims()[2]; + const int64_t in_h = x.dims()[3]; + const int64_t in_w = x.dims()[4]; + + VLOG(3) << "n: " << n << "; c: " << c << "; out_d: " << out_d + << "; out_h: " << out_h << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3] << "; " + << 
out->dims()[4]; + + int64_t count = n * out_d * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSample3DCudaKernel \ + <<>>( \ + count, \ + c, \ + out_d, \ + out_h, \ + out_w, \ + in_d, \ + in_h, \ + in_w, \ + x.data(), \ + grid.data(), \ + output_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners) + if (use_int32_index) { + LAUNCH_KERNEL(int); + } else { + LAUNCH_KERNEL(int64_t); + } +#undef LAUNCH_KERNEL + } +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL( + grid_sample, metax_gpu, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu index eae8c8c0301..d2f39ccf751 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu @@ -35,6 +35,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, const int32_t group_size, DenseTensor* out) { dev_ctx.template Alloc(out); + auto stream = dev_ctx.stream(); const T* x_data = x.data(); const int8_t* weight_data = weight.data(); const T* bias_data = bias ? bias.get().data() : nullptr; @@ -128,7 +129,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, k, n, n}; - mctlass_op(arguments); + mctlass_op(arguments, NULL, stream); } else { mctlassGemmScaleOp_w8a16_bias mctlass_op; typename mctlassGemmScaleOp_w8a16_bias::Arguments arguments{ From 89912995a39f939a582aeb953f761a588c89663d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:02:41 +0800 Subject: [PATCH 045/121] =?UTF-8?q?change=E2=80=94ut=20(#59)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * change_ut --- backends/metax_gpu/tests/default.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 9f073d7e92f..9c989161fed 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -42,7 +42,6 @@ test_shape_op test_tril_triu_op test_slice_op test_elementwise_add_op -test_index_put_op test_bincount_op test_assign_op test_logical_op @@ -73,7 +72,6 @@ test_fractional_max_pool3d_api test_nll_loss test_is_empty_op test_norm_nn_grad -test_index_fill test_floor test_slice_scatter test_nn_matmul_v2_grad @@ -127,10 +125,8 @@ test_flip test_fused_bias_dropout_residual_layer_norm_op test_greater_equal_op test_add_op -test_cartesian_prod test_uniform_random_inplace_op test_feed_fetch_method -test_pow_op test_conv3d_transpose_op test_add_position_encoding_op test_imperative_data_loader_base @@ -223,12 +219,9 @@ test_executor_check_fetch_list test_inplace_softmax_with_cross_entropy test_cos test_imperative_parallel_coalesce_split -test_grid_sample_function -test_rnn_decode_api test_triu_indices_op test_binary_cross_entropy_with_logits_op test_mean_op_v1 -test_round_op test_assign_pos_op_dygraph test_nn_functional_embedding_static test_norm_op @@ -262,7 +255,6 @@ test_diag_v2 test_complex_transpose test_prior_box_op test_square_error_cost -test_fused_rotary_position_embedding test_gru_rnn_op test_restrict_nonzero test_dygraph_weight_norm @@ -295,7 +287,6 @@ test_argsort_op test_layer_norm_op_v2 test_adaptive_max_pool1d test_shard_index_op -test_cuda_max_memory_allocated 
test_roi_align_op test_sin test_take From a770e6f197e8c519712a4a7d2359110d34dc0431 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:50:24 +0800 Subject: [PATCH 046/121] change_ut (#60) * change_ut --------- --- backends/metax_gpu/tests/default.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 9c989161fed..21adad68f5b 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -28,7 +28,6 @@ test_one_hot_v2_op test_fill_any_op test_gather_op test_reshape_op -test_index_put_op test_bitwise_op test_max_op test_pad_op @@ -214,7 +213,6 @@ test_tile_op test_adam_optimizer_fp32_fp64 test_batch_norm_op test_gather_nd_op -test_pow test_executor_check_fetch_list test_inplace_softmax_with_cross_entropy test_cos From 902112bb8707edebefa747e4994384df27c3f356 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 24 Sep 2025 10:05:05 +0800 Subject: [PATCH 047/121] change_ut (#63) * change_ut * change_ut --------- --- backends/metax_gpu/tests/default.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 21adad68f5b..54f0b7c008f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -177,7 +177,6 @@ test_imperative_data_parallel test_sigmoid test_adaptive_max_pool3d test_roll_op -test_index_put_op test_assign_op test_amp_check_finite_and_scale_op test_strided_slice_op From cfe44ce24e2e67c595057e0568b7c34f55c08b0a Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:04:11 +0800 Subject: [PATCH 048/121] [Metax] add keyword filter in CI CMakeLists.txt (#64) * [Metax] add keyword filter in CI CMakeLists.txt * [Metax] add ignore case list --- backends/metax_gpu/tests/CMakeLists.txt | 62 ++++++++++++------------- backends/metax_gpu/tests/ignore.txt | 21 +++++++++ 2 files changed, 50 insertions(+), 33 deletions(-) create mode 100644 backends/metax_gpu/tests/ignore.txt diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index e8b11d347d9..0c84ada4b65 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -9,6 +9,8 @@ set(PADDLE_LEGACY_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) +set(NEED_REMOVE_KEYWORDS "attention") + file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") if(NOT TEST_LIST_FILE) @@ -33,6 +35,20 @@ else() endif() foreach(test_name ${TEST_PROGRAMS}) + set(IS_REMOVE FALSE) + + foreach(keyword ${NEED_REMOVE_KEYWORDS}) + string(FIND "${test_name}" "${keyword}" RES) + if(NOT RES EQUAL -1) + set(IS_REMOVE TRUE) + break() + endif() + endforeach() + + if(IS_REMOVE) + continue() + endif() + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") @@ -44,39 +60,19 @@ endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) if(NOT TEST_LIST_FILE) - list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py - # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - 
# core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口的适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) + set(NEED_IGNORE_FILE ${CMAKE_CURRENT_LIST_DIR}/ignore.txt) + if(EXISTS ${NEED_IGNORE_FILE}) + file(STRINGS ${NEED_IGNORE_FILE} NEED_IGNORE_TEST_PROGRAMS) + foreach(test_name ${NEED_IGNORE_TEST_PROGRAMS}) + if(EXISTS ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + else() + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${METAX_UNIT_TEST_PATH}/${test_name}.py) + endif() + endforeach() + endif() endif() if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt new file mode 100644 index 00000000000..b4f1afbe5b0 --- /dev/null +++ b/backends/metax_gpu/tests/ignore.txt @@ -0,0 +1,21 @@ +test_matmul_op_metax +test_sum_op +test_max_op +test_cumsum_op +test_softmax_with_cross_entropy_op +test_softmax_op +test_elementwise_add_op +test_gather_op +test_elementwise_pow_op +test_layer_norm_op +test_index_add_op +test_elementwise_div_op +test_stack_op +test_logical_op +test_mean_op +test_transpose_op +test_randint_op +test_uniform_random_op +test_c_embedding_op +test_slice_op +test_compare_op From 78946fd334dacbdb3f8ba9b07d9273a8462e8512 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Fri, 26 Sep 2025 15:48:08 +0800 Subject: [PATCH 049/121] [metax] modify kernels (#67) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context * modify kernels --- .../fused_conv2d_add_act_kernel_register.cu | 0 .../fused_rope_grad_kernel_register.cu | 0 .../fused_rope_kernel_register.cu | 0 .../kernels/metax_kernel/metax_context.cc | 26 ------------------- .../kernels/metax_kernel/metax_context.h | 3 +-- 5 files changed, 1 insertion(+), 28 deletions(-) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_conv2d_add_act_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_rope_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_rope_kernel_register.cu (100%) diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_conv2d_add_act_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_conv2d_add_act_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index efddba5f00b..0712fb75bbe 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,24 +15,6 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { -const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return true; -}(); - -const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, @@ -42,19 +24,11 @@ void DnnWorkspaceHandle::RunFuncSync( void* workspace_ptr = nullptr; size_t size = ((required_workspace_bytes + 255) >> 8) << 8; std::lock_guard guard(*mtx_); -#ifdef PADDLE_WITH_HIP - auto status = hipMalloc(&workspace_ptr, size); -#else auto status = cudaMalloc(&workspace_ptr, size); -#endif if (status == gpuSuccess) { cudnn_func(workspace_ptr); phi::backends::gpu::GpuStreamSync(stream_); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); -#else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); -#endif return; } } diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 2d761439089..7386811a236 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -18,6 +18,7 @@ #include #include "kernels/funcs/blas/cublasLt.h" +#include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -30,8 +31,6 @@ cublasLtHandle_t GetBlasLtHandle(); namespace phi { -bool AllowTF32Cublas(); -bool AllowTF32Cudnn(); class DnnWorkspaceHandle { public: inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) From ac78af20874e28a7d5c3f1beed40762c716213bb Mon Sep 17 00:00:00 2001 From: Theendlessofhell <148317258+Theendlessofhell@users.noreply.github.com> Date: Fri, 26 Sep 2025 15:48:59 +0800 Subject: [PATCH 050/121] Fix part of the missing kernel issues (#66) Co-authored-by: root --- .../kernels/cuda_kernels/multinomial_kernel_register.cu | 3 ++- .../kernels/cuda_kernels/take_along_axis_kernel_register.cu | 5 ++++- 
.../metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu | 1 + .../kernels/metax_kernel/layer_norm_grad_kernel_register.cu | 1 + 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu index 622e70728f1..1325fa339b0 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu @@ -21,6 +21,7 @@ PD_CUSTOM_KERNEL_REGISTER(multinomial, phi::MultinomialKernel, phi::dtype::float16, phi::dtype::bfloat16, - float) { + float, + double) { kernel->OutputAt(0).SetDataType(phi::DataType::INT64); } diff --git a/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu index 4b23b0820fc..b628552aaaf 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu @@ -25,4 +25,7 @@ PD_CUSTOM_KERNEL_REGISTER(take_along_axis, int64_t, int, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + uint8_t, // 支持 uint8 + int16_t // 支持 int16 +) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu index 287fa8de41a..ead21b1eb7e 100644 --- a/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu @@ -22,5 +22,6 @@ PD_REGISTER_PLUGIN_KERNEL(addmm, ALL_LAYOUT, phi::AddmmKernel, float, + double, phi::dtype::float16, phi::dtype::bfloat16) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu index 87c06dab2a4..857dcb6d522 100644 --- a/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu @@ -115,6 +115,7 @@ PD_REGISTER_PLUGIN_KERNEL(layer_norm_grad, ALL_LAYOUT, phi::LayerNormGradKernel, float, + double, phi::dtype::float16, phi::dtype::bfloat16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { From 4ce9fe6de10402f04917cae8bd0f83bf499bdf1e Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:18:36 +0800 Subject: [PATCH 051/121] [Metax] fix index_elementwise_get kernel (#68) * [Metax] add keyword filter in CI CMakeLists.txt * [Metax] add ignore case list * [Metax] fix phi::backends::gpu::DnnVersion() symbol not found * Revert "[Metax] fix phi::backends::gpu::DnnVersion() symbol not found" This reverts commit 087a9c1240f024210d536e543a2fc55db1175529. 
* [Metax] fix index_elementwise_get kernel --- backends/metax_gpu/CMakeLists.txt | 2 +- .../index_elementwise_get_kernel_register.cu | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index bca1ce7aad4..3b74ae39c18 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -326,7 +326,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/increment_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu index 5ab3d2a3170..a45a740fc61 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/index_elementwise_get_kernel.h" +#include "paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu" // NOLINT PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, metax_gpu, @@ -27,7 +27,7 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, int64_t, int16_t, uint8_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} From 3c8d0173075d49bef48a909a39f12d325e276f00 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 29 Sep 2025 10:42:05 +0800 Subject: [PATCH 052/121] [metax]fix patch and fix missing kernel (#72) * [metax]fix patch and fix missing kernel --- backends/metax_gpu/CMakeLists.txt | 3 + .../cuda_kernels/adam_kernel_selected_rows.cu | 41 ++++++++++++ .../cuda_kernels/einsum_kernel_register.cu | 16 ++--- .../lars_momentum_kernel_register.cu | 29 +++++++++ .../cuda_kernels/nonzero_kernel_register.cu | 8 ++- .../put_along_axis_kernel_register.cu | 6 +- backends/metax_gpu/patch/paddle.patch | 65 ------------------- 7 files changed, 90 insertions(+), 78 deletions(-) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 3b74ae39c18..5930eaaebd2 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -535,6 +535,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/uniform_random_batch_size_like_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/get_tensor_from_selected_rows_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/batch_norm_kernel.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/empty_kernel.cc @@ -642,6 +643,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/lars_momentum_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_sum_kernel.cu # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps diff --git a/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu b/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu new file mode 100644 index 00000000000..df4105efbd2 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu @@ -0,0 +1,41 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/selected_rows_functor.h" +#include "paddle/phi/kernels/selected_rows/adam_kernel.h" + +PD_CUSTOM_KERNEL_REGISTER(adam_dense_param_sparse_grad, + metax_gpu, + ALL_LAYOUT, + phi::sr::AdamDenseParamSparseGradKernel, + float, + double, + phi::float16) { + // Skip beta1_pow, beta2_pow, skip_update data transform + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(7).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(9).SetBackend(phi::Backend::ALL_BACKEND); + + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32); + } + kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); + kernel->OutputAt(5).SetBackend(phi::Backend::UNDEFINED); +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu index 444928af78f..0f613b55e9e 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu @@ -23,10 +23,10 @@ PD_CUSTOM_KERNEL_REGISTER(einsum, phi::EinsumKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(einsum_infer, metax_gpu, @@ -34,7 +34,7 @@ PD_CUSTOM_KERNEL_REGISTER(einsum_infer, phi::EinsumInferKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + 
phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu new file mode 100644 index 00000000000..5647c806bfd --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/lars_momentum_kernel.h" + +PD_CUSTOM_KERNEL_REGISTER(lars_momentum, + metax_gpu, + ALL_LAYOUT, + phi::LarsMomentumKernel, + float, + double, + phi::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu index 1f84b628e84..dc92b2c6d69 100755 --- a/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu @@ -23,11 +23,13 @@ PD_CUSTOM_KERNEL_REGISTER(nonzero, int64_t, int, int16_t, - phi::dtype::float16, - phi::dtype::bfloat16, + phi::float16, + phi::bfloat16, bool, float, - double) { + double, + phi::complex64, + phi::complex128) { kernel->OutputAt(0).SetDataType(phi::DataType::INT64); } diff --git a/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu index 8ff1f5959ab..ca93a8ca079 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu @@ -23,6 +23,8 @@ PD_CUSTOM_KERNEL_REGISTER(put_along_axis, float, double, int64_t, + uint8_t, + int16_t, int, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index beefb730bf7..4c06609338c 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -869,19 +869,6 @@ index e838778952..83e805e75a 100644 namespace phi { namespace fusion { -diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu -index 4c93778bde..c7bdf8a2cc 100644 ---- a/paddle/phi/kernels/gpu/correlation_kernel.cu -+++ b/paddle/phi/kernels/gpu/correlation_kernel.cu -@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, - int stride2, - int corr_type_multiply, - DenseTensor *out) { -- bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; -+ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM; - 
PADDLE_ENFORCE_EQ( - is_gpu_place, - true, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h @@ -897,19 +884,6 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. -diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu -index c2ddfa1347..c6adf5a6de 100644 ---- a/paddle/phi/kernels/gpu/dgc_kernel.cu -+++ b/paddle/phi/kernels/gpu/dgc_kernel.cu -@@ -188,7 +188,7 @@ void DGCKernel(const Context& dev_ctx, - int buf_size = paddle::communication::dgc::get_buffer_size(k); - phi::Allocator::AllocationPtr tmp_ious_data; - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -- if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - tmp_ious_data = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - buf_size, diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -974,19 +948,6 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" -diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -index 05a977828f..5136608c41 100644 ---- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -+++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -@@ -58,7 +58,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, - int64_t seed_int = 0; - if (seed.initialized()) { - const auto& seed_place = seed.place().GetType(); -- bool is_gpu_place = seed_place == phi::AllocationType::GPU; -+ bool is_gpu_place = seed_place == phi::AllocationType::GPU || seed_place == phi::AllocationType::CUSTOM; - if (is_gpu_place) { - // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would - // not be CUDAPlace in practice. 
This case would only happen in Python diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -1144,32 +1105,6 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" -diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h -index 7b85903776..3f4b298807 100644 ---- a/paddle/phi/kernels/impl/merged_momentum_impl.h -+++ b/paddle/phi/kernels/impl/merged_momentum_impl.h -@@ -297,7 +297,7 @@ void MergedMomentumInnerCompute( - params_out[idx], - velocities_out[idx]); - VLOG(10) << "Launch MergedMomentum cpu kernel."; -- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - phi::funcs::ForRange for_range( - static_cast(dev_ctx), params[idx]->numel()); - const auto grad_type = grads[idx]->dtype(); -diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h -index de5bcfc30b..eb2a9714f5 100644 ---- a/paddle/phi/kernels/impl/momentum_kernel_impl.h -+++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h -@@ -457,7 +457,7 @@ void MomentumDenseImpl(const Context& dev_ctx, - regularization_coeff, - param_out, - velocity_out); -- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - funcs::ForRange for_range(dev_ctx, param.numel()); - const auto grad_type = grad.dtype(); - #define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h From 7303ae2c86253711559c2fe2f0abbc770541fe5e Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 29 Sep 2025 17:08:34 +0800 Subject: [PATCH 053/121] [metax] modify kernels (#73) * modify kernels --- .../kernels/impl/addmm_kernel_impl.h | 1 + backends/metax_gpu/patch/paddle.patch | 60 ++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h index fb1368b069c..b517b719d49 100644 --- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h @@ -98,6 +98,7 @@ void AddmmKernel(const Context& dev_ctx, y_dims[0])); dev_ctx.template Alloc(out); + if (out->numel() == 0) return; auto blas = funcs::GetBlas(dev_ctx); // calc broadcast dim diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 4c06609338c..69d714ef6e0 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -438,6 +438,21 @@ index d69eb67d6f..1d8b6e9375 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" +diff --git a/paddle/phi/kernels/funcs/embedding_grad.h b/paddle/phi/kernels/funcs/embedding_grad.h +index 461e6e2474..48a64ae9ce 100644 +--- a/paddle/phi/kernels/funcs/embedding_grad.h ++++ 
b/paddle/phi/kernels/funcs/embedding_grad.h +@@ -143,8 +143,8 @@ void LaunchEmbeddingGradDeterministicKernel(const GPUContext& dev_ctx, + constexpr int kWarpSize = 64; + constexpr int kBlockDimY = 16; + #else +- constexpr int kWarpSize = 32; +- constexpr int kBlockDimY = 32; ++ constexpr int kWarpSize = 64; ++ constexpr int kBlockDimY = 16; + #endif + dim3 threads(kWarpSize, kBlockDimY); + dim3 grids(static_cast((D + kWarpSize - 1) / kWarpSize)); diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu @@ -501,6 +516,49 @@ index 15e1a4a3c3..e4780538d7 100644 #include "paddle/phi/kernels/funcs/im2col.h" namespace phi { +diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h +index e5361b836e..5ad238df08 100644 +--- a/paddle/phi/kernels/funcs/math_cuda_utils.h ++++ b/paddle/phi/kernels/funcs/math_cuda_utils.h +@@ -175,12 +175,12 @@ struct KeyValuePair { + #define WARP_SIZE_WIDTH_MASK 0x3f + typedef u_int64_t warp_mask_t; + #else +-#define FINAL_MASK 0xffffffff +-#define HALF_WARP 16 +-#define WARP_SIZE 32 +-#define WARP_SIZE_WIDTH 5 +-#define WARP_SIZE_WIDTH_MASK 0x1f +-typedef unsigned warp_mask_t; ++#define FINAL_MASK 0xffffffffffffffffUL ++#define HALF_WARP 32 ++#define WARP_SIZE 64 ++#define WARP_SIZE_WIDTH 6 ++#define WARP_SIZE_WIDTH_MASK 0x3f ++typedef u_int64_t warp_mask_t; + #endif + + template +@@ -200,19 +200,13 @@ __inline__ __device__ T BlockReduceSum(T val, warp_mask_t mask) { + static __shared__ T shared[WARP_SIZE]; + int lane = threadIdx.x & WARP_SIZE_WIDTH_MASK; + int wid = threadIdx.x >> WARP_SIZE_WIDTH; +- + val = WarpReduceSum(val, mask); +- +- __syncthreads(); + if (lane == 0) shared[wid] = val; +- + __syncthreads(); +- + // align block_span to warpSize + int block_span = (blockDim.x + warpSize - 1) >> WARP_SIZE_WIDTH; + val = (lane < block_span) ? 
shared[lane] : static_cast(0.0f); + val = WarpReduceSum(val, mask); +- + return val; + } + diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -534,7 +592,7 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/scatter.cu.h" diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -index 8b0baf5f5f..260482f124 100644 +index 047f52bd91..a05b34d3ba 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; From 8b184a32bd9e02c0d8b405d670a8e888a4522f42 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 29 Sep 2025 18:11:03 +0800 Subject: [PATCH 054/121] [metax] modify kernels (#74) * modify kernels --- .../gpudnn/conv_grad_kernel_register.cu | 37 ++++++++----------- .../kernels/gpudnn/conv_kernel_register.cu | 19 +++++----- .../kernels/gpudnn/conv_transpose_kernel.cu | 15 ++++---- .../depthwise_conv_grad_kernel.cu | 14 +++---- .../metax_kernel/depthwise_conv_kernel.cu | 14 +++---- 5 files changed, 45 insertions(+), 54 deletions(-) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index e4acb2f95b6..2da42c7ff8c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -437,26 +437,22 @@ void ConvCudnnGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(filter_grad); } - // bool has_use_addto = dev_ctx.HasDnnAttr("use_addto"); - bool has_use_addto = "true"; + bool has_use_addto = dev_ctx.HasDnnAttr("use_addto"); VLOG(4) << "GPUContext contains `use_addto`: " << has_use_addto; - // bool use_addto = has_use_addto - // ? PADDLE_GET_CONST(bool, "true") - // : false; - bool use_addto = "true"; + bool use_addto = has_use_addto + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("use_addto")) + : false; std::vector dilations = dilations_t; std::vector strides = strides_t; std::vector paddings = paddings_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - bool has_exhaustive_search = "true"; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); VLOG(4) << "GPUContext contains `exhaustive_search`: " << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, "true") - // : false; - bool exhaustive_search_attr = "true"; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; @@ -835,14 +831,13 @@ void ConvCudnnGradGradKernel( T* transformed_dx = nullptr; std::vector dilations = dilations_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // VLOG(4) << "GPUContext contains `exhaustive_search`: " - // << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - bool exhaustive_search_attr = "true"; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index 0a83b504c76..d6b243c956c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -228,15 +228,16 @@ void ConvCudnnKernel(const Context& dev_ctx, std::vector paddings = paddings_t; std::vector dilations = dilations_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // VLOG(4) << "GPUContext contains `exhaustive_search`: " - // << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - - bool exhaustive_search = FLAGS_cudnn_exhaustive_search; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index 532b7af0db4..4049d2f3130 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -260,14 +260,13 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, return; } - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - // bool exhaustive_search = - // FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; - bool exhaustive_search = FLAGS_cudnn_exhaustive_search; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, diff --git a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu index f2475298963..4e5f881385a 100644 --- a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu @@ -54,14 +54,12 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, return; } - // bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); - // bool fuse_relu = - // has_fuse_relu - // ? 
PADDLE_GET_CONST( - // bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) - // : false; - bool has_fuse_relu = false; - bool fuse_relu = false; + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; std::vector strides = strides_t; std::vector paddings = paddings_t; diff --git a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu index 517f26b1c02..d3d6c4a4edd 100644 --- a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu @@ -48,14 +48,12 @@ void DepthwiseConvKernel(const Context& dev_ctx, const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - // bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); - // bool fuse_relu = - // has_fuse_relu - // ? PADDLE_GET_CONST( - // bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) - // : false; - bool has_fuse_relu = false; - bool fuse_relu = false; + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; if (channel_last) { PADDLE_ENFORCE_EQ( From 60f0ed637f73305e8f0fbd03917e3c8e2978d1ef Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 11:33:54 +0800 Subject: [PATCH 055/121] [metax] link mccl and fix missing kernel (#76) * [metax] link mccl and fix missing kernel --- backends/metax_gpu/CMakeLists.txt | 7 + .../cross_entropy_bwd_w_downcast.cu | 291 ++++++++++++ .../embedding_grad_add_to_kernel.cu | 27 ++ .../cuda_kernels/gammaln_grad_kernel.cu | 28 ++ .../moe_combine_no_weight_grad_kernel.cu | 25 + .../cuda_kernels/multihead_matmul_kernel.cu | 433 ++++++++++++++++++ backends/metax_gpu/kernels/funcs/generator.cc | 287 ++++++++++++ .../kernels/impl/gammaln_grad_kernel_impl.h | 112 +++++ .../metax_kernel/cudnn_lstm_grad_kernel.cu | 362 +++++++++++++++ .../kernels/metax_kernel/cudnn_lstm_kernel.cu | 428 +++++++++++++++++ backends/metax_gpu/tests/ignore.txt | 4 + 11 files changed, 2004 insertions(+) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu create mode 100644 backends/metax_gpu/kernels/funcs/generator.cc create mode 100644 backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h create mode 100644 backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 5930eaaebd2..2bb282cf54f 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -326,6 +326,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/increment_kernel.cu + 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/cross_entropy_bwd_w_downcast.cu # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu @@ -728,6 +730,11 @@ target_link_libraries( ${WARPCTC_LIBRARIES} ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) + +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) + include_directories(BEFORE ${PADDLE_SOURCE_DIR}) target_compile_definitions( diff --git a/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu b/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu new file mode 100644 index 00000000000..a0d5dfd7a5a --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu @@ -0,0 +1,291 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/cross_entropy_grad_kernel.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "kernels/gpudnn/softmax_gpudnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/softmax.h" + +namespace phi { + +/* + Vectorized wrapper of softmax with cross entropy grad hard label. + Optimized with float4 vectorization for memory coalescing and improved + throughput. 
+*/ +template +__global__ void SoftmaxWithCrossEntropyGradHardLabelVectorized( + LogitT* __restrict__ logits_grad, + const T* __restrict__ loss_grad, + const T* __restrict__ softmax, + const LabelT* __restrict__ labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + // Vectorized load/store with float4 for 128-bit memory transactions + constexpr int VEC_SIZE = 4; + using VecT = typename phi::AlignedVector; + using SoftmaxVecT = typename phi::AlignedVector; + + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t vec_id = tid * VEC_SIZE; + + // Ensure we don't exceed bounds + if (vec_id >= n * dim * d) return; + + // Compute indices for vectorized access + int64_t idx_n = vec_id / (d * dim); + int64_t idx_dim_start = (vec_id / d) % dim; + int64_t idx_d = vec_id % d; + int64_t ids = idx_n * d + idx_d; + + // Load label once per thread + auto lbl = static_cast(labels[ids]); + + if (lbl == ignore_index) { + // Vectorized zero fill for ignore_index + VecT* vec_grad = reinterpret_cast(&logits_grad[vec_id]); + VecT zero_vec; +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + zero_vec.val[i] = static_cast(0.0f); + } + *vec_grad = zero_vec; + return; + } + + // Vectorized load of softmax values + SoftmaxVecT softmax_vec; + const SoftmaxVecT* softmax_ptr = + reinterpret_cast(&softmax[vec_id]); + softmax_vec = *softmax_ptr; + + // Load loss gradient (broadcast across vector elements) + T loss_grad_val = loss_grad[ids]; + + // Vectorized computation + VecT grad_vec; +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + int64_t current_dim = idx_dim_start + i; + if (current_dim < dim) { // Bounds check for partial vectors + float softmax_val = static_cast(softmax_vec.val[i]); + float grad_val; + + if (lbl == current_dim) { + grad_val = (softmax_val - 1.0f) * static_cast(loss_grad_val); + } else { + grad_val = softmax_val * static_cast(loss_grad_val); + } + + grad_vec.val[i] = static_cast(grad_val); + } else { + grad_vec.val[i] = static_cast(0.0f); + } + } + + // Vectorized store + VecT* grad_ptr = reinterpret_cast(&logits_grad[vec_id]); + *grad_ptr = grad_vec; +} + +/* + Specialized kernel for dimensions not divisible by vector size + Uses warp-level primitives for better performance on irregular sizes +*/ +template +__global__ void SoftmaxWithCrossEntropyGradHardLabelWarp( + LogitT* __restrict__ logits_grad, + const T* __restrict__ loss_grad, + const T* __restrict__ softmax, + const LabelT* __restrict__ labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + const int warps_per_block = 4; + const int threads_per_warp = 32; + const int threads_per_block = warps_per_block * threads_per_warp; + + int tid = blockIdx.x * threads_per_block + threadIdx.x; + int warp_id = threadIdx.x / threads_per_warp; + int lane_id = threadIdx.x % threads_per_warp; + + // Process multiple elements per thread using warp-level parallelism + int64_t elements_per_thread = + (n * dim * d + gridDim.x * threads_per_block - 1) / + (gridDim.x * threads_per_block); + + for (int e = 0; e < elements_per_thread; ++e) { + int64_t idx = tid + e * gridDim.x * threads_per_block; + if (idx >= n * dim * d) break; + + int64_t idx_n = idx / (d * dim); + int64_t idx_dim = (idx / d) % dim; + int64_t idx_d = idx % d; + int64_t ids = idx_n * d + idx_d; + + auto lbl = static_cast(labels[ids]); + + if (lbl == ignore_index) { + logits_grad[idx] = static_cast(0.0f); + } else if (lbl == idx_dim) { + logits_grad[idx] = + 
static_cast((static_cast(softmax[idx]) - 1.0f) * + static_cast(loss_grad[ids])); + } else { + logits_grad[idx] = + static_cast(static_cast(softmax[idx]) * + static_cast(loss_grad[ids])); + } + } +} + +/* + Optimized kernel selector based on problem size and alignment +*/ +template +void LaunchOptimizedCrossEntropyGradKernel(const GPUContext& dev_ctx, + LogitT* logits_grad, + const T* loss_grad, + const T* softmax, + const LabelT* labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + const int64_t total_elements = n * dim * d; + auto stream = dev_ctx.stream(); + + // Check alignment for vectorized kernel + bool is_aligned = (reinterpret_cast(logits_grad) % 16 == 0) && + (reinterpret_cast(softmax) % 16 == 0) && + (total_elements % 4 == 0); + + if (is_aligned && total_elements >= 1024) { + // Use vectorized kernel for aligned, large problems + constexpr int VEC_SIZE = 4; + const int threads_per_block = 256; + const int vec_elements = total_elements / VEC_SIZE; + const int blocks = + (vec_elements + threads_per_block - 1) / threads_per_block; + + SoftmaxWithCrossEntropyGradHardLabelVectorized + <<>>( + logits_grad, loss_grad, softmax, labels, n, dim, d, ignore_index); + } else { + // Use warp-specialized kernel for irregular sizes + const int warps_per_block = 4; + const int threads_per_block = warps_per_block * 32; + const int blocks = + std::min(1024, + static_cast((total_elements + threads_per_block - 1) / + threads_per_block)); + + SoftmaxWithCrossEntropyGradHardLabelWarp + <<>>( + logits_grad, loss_grad, softmax, labels, n, dim, d, ignore_index); + } +} + +template +void CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel( + const GPUContext& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + int axis, + DenseTensor* logits_grad) { + // PADDLE_ENFORCE_EQ( + // dev_ctx.GetPlace().GetType(), + // phi::AllocationType::GPU, + // common::errors::Unavailable("softmax_with_cross_entropy operator's " + // "CUDA kernel only runs on GPU device.")); + + using LogitT = phi::bfloat16; + const T* loss_grad_data = loss_grad.data(); + DenseTensor* logit_grad = logits_grad; + + LogitT* logit_grad_data = nullptr; + logit_grad_data = dev_ctx.template Alloc(logit_grad); + + const int rank = logit_grad->dims().size(); + const int axis_v = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = logit_grad->dims()[axis_v]; + + const int64_t n = phi::funcs::SizeToAxis(axis_v, logit_grad->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims()); + const int64_t remain = d / axis_dim; + + const T* softmax_data = softmax.data(); + const auto* label_data = label.data(); + + // Launch optimized kernel with automatic selection + LaunchOptimizedCrossEntropyGradKernel(dev_ctx, + logit_grad_data, + loss_grad_data, + softmax_data, + label_data, + n, + axis_dim, + remain, + -100); +} + +template +void CrossEntropyWithSoftmaxBwdWithDowncastKernel(const Context& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + DenseTensor* logits_grad) { + constexpr int axis = -1; + if (logits_grad->numel() == 0) { + dev_ctx.template Alloc(logits_grad); + return; + } + auto dtype = label.dtype(); + PD_VISIT_INTEGRAL_TYPES( + dtype, "CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel", ([&] { + CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel( + dev_ctx, label, softmax, loss_grad, axis, logits_grad); + })); +} + +} // namespace phi + 
+PD_REGISTER_PLUGIN_KERNEL(cross_entropy_with_softmax_bwd_w_downcast, + metax_gpu, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxBwdWithDowncastKernel, + float, + double, + phi::float16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu new file mode 100644 index 00000000000..6b20feee0fd --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu @@ -0,0 +1,27 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/embedding_grad_kernel.h" +#include "paddle/phi/kernels/funcs/embedding_grad.h" +#include "paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(embedding_grad_add_to, + metax_gpu, + ALL_LAYOUT, + phi::EmbeddingGradAddToAddToKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu new file mode 100644 index 00000000000..c6bd53f007f --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernels/impl/gammaln_grad_kernel_impl.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gammaln_grad_kernel.h" + +PD_REGISTER_PLUGIN_KERNEL(gammaln_grad, + metax_gpu, + ALL_LAYOUT, + phi::GammalnGradKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu new file mode 100644 index 00000000000..e6984cf86d2 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/legacy/gpu/moe_combine_no_weight_grad_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(moe_combine_no_weight_grad, + metax_gpu, + ALL_LAYOUT, + phi::MoeCombineNoWeightGradKernel, + float, + double, + phi::bfloat16, + phi::float16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu new file mode 100644 index 00000000000..151c929e41c --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu @@ -0,0 +1,433 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "kernels/funcs/blas/blas.h" +#include "paddle/common/errors.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" + +namespace phi { +namespace fusion { + +template +__global__ void transpose(T *src, + T *dst, + const int batch_size, + const int seq_len, + const int head_num, + const int size_per_head) { + int batch_id = blockIdx.x / (head_num * seq_len); + int seq_id = blockIdx.x % seq_len; + int head_id = (blockIdx.x % (head_num * seq_len)) / seq_len; + dst[batch_id * (head_num * seq_len * size_per_head) + + seq_id * head_num * size_per_head + head_id * size_per_head + + threadIdx.x] = src[blockIdx.x * size_per_head + threadIdx.x]; +} + +template +inline __device__ T add_func(T a, T b); + +template <> +__device__ float add_func(float a, float b) { + return a + b; +} + +template <> +__device__ float2 add_func(float2 a, float2 b) { + float2 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + return c; +} + +template <> +__device__ float4 add_func(float4 a, float4 b) { + float4 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + c.z = a.z + b.z; + c.w = a.w + b.w; + return c; +} +#if defined(PADDLE_WITH_CUDA) +template <> +__device__ half2 add_func(half2 a, half2 b) { +#if __CUDA_ARCH__ >= 530 + return __hadd2(a, b); +#else + return half2(__float2half(__half2float(a.x) + __half2float(b.x)), + __float2half(__half2float(b.x) + __half2float(b.y))); +#endif +} + +template <> +__device__ half add_func(half a, half b) { +#if __CUDA_ARCH__ >= 530 + return __hadd(a, b); +#else + return __float2half(__half2float(a) + __half2float(b)); +#endif +} +#endif + +template +__global__ void TransposeQkvKernel(const int H, + const T *input, + const T *bias, + T *output) { + // Input: BxSx3xNxH + // Bias: 3xNxH + 
// Output: 3xBxNxSxH + int n = threadIdx.y; + int s = blockIdx.x; + int b = blockIdx.y; + int m = blockIdx.z; + + const int N = blockDim.y; + const int S = gridDim.x; + const int B = gridDim.y; + + const int NH = N * H; + const int NHS = NH * S; + const int in_offset = n * H + m * NH + s * 3 * NH + b * NHS * 3; + const int bias_offset = m * NH + n * H; + const int out_offset = s * H + n * S * H + b * NHS + m * NHS * B; + + const int i = threadIdx.x; + output[out_offset + i] = + add_func(input[in_offset + i], bias[bias_offset + i]); +} + +template +void TransQKVWithBias(const int batch, + const int seq_len, + const int head_size, + const int head_num, + const T *input, + const T *bias, + T *output, + gpuStream_t stream); + +template <> +void TransQKVWithBias(const int batch, + const int seq_len, + const int head_size, + const int head_num, + const float *input, + const float *bias, + float *output, + gpuStream_t stream) { + // BxSx3xNxH + 3xNxH -> 3xBxNxSxH + int scratch_size = batch * head_num * seq_len * seq_len; + const dim3 grid(seq_len, batch, 3); + // scratch % 4 == 0 to ensure the alignment + if (head_size % 4 == 0 && scratch_size % 4 == 0) { + const int h = head_size / 4; + const float4 *input4 = reinterpret_cast(input); + const float4 *bias4 = reinterpret_cast(bias); + float4 *output4 = reinterpret_cast(output); + const dim3 block(h, head_num, 1); + + // limit h * head_num to max block size(1024). + PADDLE_ENFORCE_LE(h * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024 * 4)); + TransposeQkvKernel + <<>>(h, input4, bias4, output4); + } else if (head_size % 2 == 0 && scratch_size % 2 == 0) { + const int h = head_size / 2; + const float2 *input2 = reinterpret_cast(input); + const float2 *bias2 = reinterpret_cast(bias); + float2 *output2 = reinterpret_cast(output); + const dim3 block(h, head_num, 1); + // limit h * head_num to max block size(1024). + PADDLE_ENFORCE_LE(h * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024 * 2)); + TransposeQkvKernel + <<>>(h, input2, bias2, output2); + } else { + const dim3 block(head_size, head_num, 1); + // limit head_size * head_num to max block size(1024). + PADDLE_ENFORCE_LE(head_size * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024)); + TransposeQkvKernel + <<>>(head_size, input, bias, output); + } +} + +#if defined(PADDLE_WITH_CUDA) +template <> +void TransQKVWithBias(const int batch, + const int seq_len, + const int head_size, + const int head_num, + const phi::float16 *input, + const phi::float16 *bias, + phi::float16 *output, + gpuStream_t stream) { + // BxSx3xNxH + 3xNxH -> 3xBxNxSxH + int scratch_size = batch * head_num * seq_len * seq_len; + const dim3 grid(seq_len, batch, 3); + if (head_size % 2 == 0 && scratch_size % 2 == 0) { + const int h = head_size / 2; + const half2 *input2 = reinterpret_cast(input); + const half2 *bias2 = reinterpret_cast(bias); + half2 *output2 = reinterpret_cast(output); + const dim3 block(h, head_num, 1); + // limit h * head_num to max block size(1024). 
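+    // Each thread moves one half2, so a block spans head_size / 2 lanes per
+    // head; the check below keeps h * head_num inside CUDA's 1024-thread
+    // block limit.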
+ PADDLE_ENFORCE_LE(h * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024 * 2)); + TransposeQkvKernel + <<>>(h, input2, bias2, output2); + } else { + const dim3 block(head_size, head_num, 1); + const half *input_half = reinterpret_cast(input); + const half *bias_half = reinterpret_cast(bias); + half *output_half = reinterpret_cast(output); + + // limit head_size * head_num to max block size(1024). + PADDLE_ENFORCE_LE(head_size * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024)); + TransposeQkvKernel<<>>( + head_size, input_half, bias_half, output_half); + } +} +#endif + +inline int round_up(int seq_len, int multiple = 32) { + PADDLE_ENFORCE_GT( + multiple, + 0, + common::errors::InvalidArgument( + "multiple should be a positive number, but it's (%d)", multiple)); + return ((seq_len + multiple - 1) / multiple) * multiple; +} + +template +__global__ void broadcast(const T *src, + T *dst, + const int seq_len, + const int head_num) { + int batch_id = blockIdx.x / (head_num * seq_len); + int dst_offset = blockIdx.x * seq_len; + if (threadIdx.x < seq_len) { + dst[threadIdx.x + dst_offset] = src[threadIdx.x + batch_id * seq_len]; + } +} + +template +__global__ void broadcast_batch_head_number(const T *src, + T *dst, + const int batch_size, + const int seq_len, + const int head_num) { + int src_seq_id = blockIdx.x % seq_len; + int dst_offset = blockIdx.x * seq_len; + if (threadIdx.x < seq_len) { + dst[threadIdx.x + dst_offset] = src[threadIdx.x + src_seq_id * seq_len]; + } +} + +template +void MultiheadMatmulKernel(const Context &dev_ctx, + const DenseTensor &input, + const DenseTensor &w, + const DenseTensor &bias, + const paddle::optional &bias_qk, + const bool transpose_q, + const bool transpose_k, + const bool transpose_v, + const float alpha, + const int head_number, + DenseTensor *out) { + auto *input_d = input.data(); + auto *w_d = w.data(); + auto *bias_d = bias.data(); + auto *bias_qk_d = bias_qk ? 
bias_qk->data() : nullptr; + T scale = static_cast(alpha); + + // compute q*k with eltadd + auto stream = dev_ctx.stream(); + // should be (B * S * hidden) + auto input_dims = input.dims(); + // shouble be (hidden * 3 * all_head_size) + auto w_dims = w.dims(); + int batch = input_dims[0]; + int seq_len = input_dims[1]; + int hidden = input_dims[2]; + phi::DenseTensor temp_bias_tensor; + // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted + if (bias_qk && bias_qk->numel() == (batch * seq_len)) { + VLOG(4) << "Do broadcasted bias_qk from [batch, 1, 1, seq_len]"; + temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); + auto *temp_qk_bias = dev_ctx.template Alloc( + &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); + int grid = batch * head_number * seq_len; + int block = round_up(seq_len); + broadcast<<>>( + bias_qk_d, temp_qk_bias, seq_len, head_number); + bias_qk_d = static_cast(temp_qk_bias); + } + // if bias_qk is[1, 1, seq_len, seq_len], the bias_qk_d need to be + // broadcasted + if (bias_qk && bias_qk->numel() == (1 * seq_len * seq_len)) { + VLOG(4) << "do broadcasted bias_qk from [1, 1, seq_len, seq_len]"; + temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); + auto *temp_qk_bias = dev_ctx.template Alloc( + &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); + int grid = batch * head_number * seq_len; + int block = round_up(seq_len); + broadcast_batch_head_number<<>>( + bias_qk_d, temp_qk_bias, batch, seq_len, head_number); + bias_qk_d = static_cast(temp_qk_bias); + } + if (!bias_qk) { + int size = batch * head_number * seq_len * seq_len; + temp_bias_tensor.Resize({size}); + auto *temp_qk_bias = dev_ctx.template Alloc( + &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); +#ifdef PADDLE_WITH_HIP + hipMemset(temp_qk_bias, 0, sizeof(float) * size); +#else + cudaMemset(temp_qk_bias, 0, sizeof(float) * size); +#endif + bias_qk_d = static_cast(temp_qk_bias); + } + int all_head_size = w_dims[2]; + int head_size = all_head_size / head_number; + + out->Resize({batch, seq_len, all_head_size}); + auto *output_d = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + + // (B*S, hidden) + const phi::DenseTensor input_matrix = + phi::ReshapeToMatrix(input, 2 /*x_num_col_dims */); + // (hidden, 3 * all_head_size) + const phi::DenseTensor w_matrix = + phi::ReshapeToMatrix(w, 1 /*y_num_col_dims*/); + + phi::DenseTensor temp_out_tensor; + auto temp_out_dims = + common::make_ddim({batch, seq_len, 3, head_number, head_size}); + temp_out_tensor.Resize( + {batch * seq_len, common::product(temp_out_dims) / (batch * seq_len)}); + auto *temp_out_data = dev_ctx.template Alloc( + &temp_out_tensor, temp_out_tensor.numel() * sizeof(T)); + + // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H) + auto blas = phi::funcs::GetBlas(dev_ctx); + blas.MatMul(input_matrix, w_matrix, &temp_out_tensor); + VLOG(2) << "(B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)"; + // temp_out_tensor.Resize(temp_out_dims); + + phi::DenseTensor multihead_temp_tensor; + // B * head_number * S * S * 1 + B * S * 3 * N * H + int scratch_size = batch * head_number * seq_len * seq_len * 1; + multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()}); + auto *multihead_temp_data = dev_ctx.template Alloc( + &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T)); + + auto *qkptr = multihead_temp_data; + auto *tptr = multihead_temp_data + scratch_size; + + // Do the transpose with bias. + // BxSx3xNxH => tptr: 3xBxNxSxH. 
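+  // tptr then holds Q, K and V as 3 x [B, N, S, H]; the scratch region before
+  // it (qkptr, B * N * S * S elements) receives the attention scores from
+  // MultiheadGPUComputeFunctor below.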
+ TransQKVWithBias(batch, + seq_len, + head_size, + head_number, + temp_out_data, + bias_d, + tptr, + stream); + if (std::is_same::value) { + phi::funcs::MultiheadGPUComputeFunctor multihead_compute_func; + multihead_compute_func(dev_ctx, + batch, + seq_len, + head_number, + head_size, + reinterpret_cast(qkptr), + reinterpret_cast(bias_qk_d), + false, + reinterpret_cast(tptr), + __float2half(static_cast(scale)), + __float2half(0.0)); + } else { + phi::funcs::MultiheadGPUComputeFunctor multihead_compute_func; + multihead_compute_func(dev_ctx, + batch, + seq_len, + head_number, + head_size, + qkptr, + bias_qk_d, + false, + tptr, + scale, + T(0.0)); + } + + int grid = batch * head_number * seq_len; + int block = head_size; + transpose<<>>( + tptr, output_d, batch, seq_len, head_number, head_size); +} + +} // namespace fusion +} // namespace phi + +#if defined(PADDLE_WITH_CUDA) +PD_REGISTER_PLUGIN_KERNEL(multihead_matmul, + metax_gpu, + ALL_LAYOUT, + phi::fusion::MultiheadMatmulKernel, + float, + phi::float16) {} +#else +PD_REGISTER_PLUGIN_KERNEL(multihead_matmul, + metax_gpu, + ALL_LAYOUT, + phi::fusion::MultiheadMatmulKernel, + float) {} +#endif diff --git a/backends/metax_gpu/kernels/funcs/generator.cc b/backends/metax_gpu/kernels/funcs/generator.cc new file mode 100644 index 00000000000..8fcbf474b07 --- /dev/null +++ b/backends/metax_gpu/kernels/funcs/generator.cc @@ -0,0 +1,287 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/generator.h" + +#include + +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/xpu/xpu_info.h" +#include "paddle/phi/core/enforce.h" + +static uint64_t GetRandomSeed() { + std::random_device rd; + // double has 53 bit significant, so limit uint64 to 53 bits + return ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF; +} + +namespace phi { + +const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { +#if defined(PADDLE_WITH_XPU) + + static int64_t num_xpu_devices = -1; + static std::once_flag num_devices_init_flag; + static std::deque xpu_device_flags; + static std::vector> default_xpu_generators; + + std::call_once(num_devices_init_flag, []() { + num_xpu_devices = phi::backends::xpu::GetXPUDeviceCount(); + xpu_device_flags.resize(num_xpu_devices); + default_xpu_generators.resize(num_xpu_devices); + }); + if (device_id < 0) { + PADDLE_THROW(common::errors::InvalidArgument( + "xpu device id should be greater than 0")); + } + + std::call_once(xpu_device_flags[device_id], [device_id]() { + default_xpu_generators[device_id] = + std::make_shared(GetRandomSeed(), device_id); + VLOG(4) << "initial seed: " + << default_xpu_generators[device_id]->GetCurrentSeed(); + }); + return default_xpu_generators[device_id]; +#else + PADDLE_THROW(common::errors::PermissionDenied( + "getDefaultXPUGenerator only support in XPU place")); +#endif +} + +const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + static int64_t num_cuda_devices = -1; + static std::once_flag num_devices_init_flag; + static std::deque cuda_device_flags; + static std::vector> default_cuda_generators; + + std::call_once(num_devices_init_flag, []() { + num_cuda_devices = phi::backends::gpu::GetGPUDeviceCount(); + cuda_device_flags.resize(num_cuda_devices); + default_cuda_generators.resize(num_cuda_devices); + }); + if (device_id < 0) { + PADDLE_THROW(common::errors::InvalidArgument( + "cuda device id should be greater than 0")); + } + + std::call_once(cuda_device_flags[device_id], [device_id]() { + default_cuda_generators[device_id] = + std::make_shared(GetRandomSeed(), device_id); + VLOG(7) << "initial seed: " + << default_cuda_generators[device_id]->GetCurrentSeed(); + }); + return default_cuda_generators[device_id]; +#else + PADDLE_THROW(common::errors::PermissionDenied( + "getDefaultCUDAGenerator only support in CUDA place")); +#endif +} + +const std::shared_ptr& DefaultCPUGenerator() { + static auto default_cpu_generator = + std::make_shared(GetRandomSeed()); + return default_cpu_generator; +} + +const std::shared_ptr& DefaultCustomDeviceGenerator( + const phi::CustomPlace& place) { + static std:: + unordered_map, phi::Place::Hash> + generators; + if (generators.find(place) == generators.end()) { + generators.insert({place, std::make_shared(GetRandomSeed())}); + } + return generators[place]; +} + +using RNGMap = std::unordered_map>; + +static RNGMap& GetRandomSeedGeneratorMap() { + static auto random_seed_generator_map = RNGMap(); + return random_seed_generator_map; +} + +const std::shared_ptr& SetRandomSeedGenerator( + const std::string& name, uint64_t seed) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter == rng_map.end(), + true, + common::errors::AlreadyExists( + "%s RandomSeedGenerator is already exist", name)); + + auto generator = std::make_shared(seed); + bool emplace_success = rng_map.emplace(name, 
generator).second; + PADDLE_ENFORCE_EQ( + emplace_success, + true, + common::errors::PermissionDenied( + "SetRandomSeedGenerator cannot emplace %s RandomSeedGenerator", + name)); + return rng_map[name]; +} + +const std::shared_ptr& GetRandomSeedGenerator( + const std::string& name) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter != rng_map.end(), + true, + common::errors::NotFound( + "%s RandomSeedGenerator is not found, please " + "use `set_random_seed_generator` to set rng first", + name)); + return iter->second; +} + +// There are 3 conditions: +// (1) op seed is set, use op seed. +// (2) op seed is not set, global seed is set, use global seed. +// (3) op seed is not set, global seed is not set too, use random seed from +// RandomGenerator. +std::shared_ptr GetCPURandomEngine(uint64_t seed) { + if (seed == 0) { + VLOG(4) << "Use random cpu_engine from generator"; + return DefaultCPUGenerator()->GetCPUEngine(); + } else { + // NOTE(zhiqiu): creating an cpu_engine instance everytime instead of using + // OpDefaultCPUEngine(), this is the legacy behavior of random operators. + // The benefit is that when running PE with fixed-seed in multiple threads, + // each thread has their own cpu_engine, and doesn't affect each other. + // + // And we need to measure the determinacy of Generator in PE. + auto cpu_engine = std::make_shared(); + static std::mutex mu_; + { + std::lock_guard lock(mu_); + cpu_engine->seed(seed); + } + return cpu_engine; + } +} + +inline void Generator::print_state_info() { + VLOG(7) << "Generator Random state " + << "device id: " << state().device << ", seed: " << state().seed + << ", offset: " << state().offset << ", cpu_engine: " << cpu_engine(); +} + +Generator::Generator() { + auto seed = GetRandomSeed(); + current_index = states_.size(); + states_.emplace_back(-1, seed); + print_state_info(); +} + +Generator::Generator(uint64_t seed) { + current_index = states_.size(); + states_.emplace_back(-1, seed); + print_state_info(); +} + +Generator::Generator(uint64_t seed, int64_t device_id) { + current_index = states_.size(); + // device id first, then seed + states_.emplace_back(device_id, seed); + print_state_info(); +} + +phi::Generator::GeneratorState Generator::GetState() { return state(); } + +void Generator::SetState(const phi::Generator::GeneratorState& state) { + std::lock_guard lock(mu_); + if (current_index < states_.size()) + states_[current_index] = state; + else + PADDLE_THROW(common::errors::NotFound("Generator index is not found")); + print_state_info(); +} + +uint64_t Generator::GetStateIndex() { return current_index; } + +void Generator::SetStateIndex(uint64_t StateIndex) { + std::lock_guard lock(mu_); + if (current_index < states_.size()) + current_index = StateIndex; + else + PADDLE_THROW(common::errors::NotFound("Generator index is not found")); +} + +uint64_t Generator::RegisterStateIndex(const GeneratorState& state) { + std::lock_guard lock(mu_); + auto new_index = states_.size(); + states_.push_back(state); + current_index = new_index; + return new_index; +} + +inline Generator::GeneratorState& Generator::state() { + if (current_index < states_.size()) + return states_[current_index]; + else + PADDLE_THROW(common::errors::NotFound("Generator index is not found")); +} + +inline std::shared_ptr Generator::cpu_engine() { + return state().cpu_engine; +} + +uint64_t Generator::GetCurrentSeed() { + std::lock_guard lock(mu_); + return state().seed; +} + +uint64_t Generator::Seed() { + 
std::lock_guard lock(mu_); + uint64_t seed = GetRandomSeed(); + state().reset(seed); + return seed; +} + +void Generator::SetCurrentSeed(uint64_t seed) { + std::lock_guard lock(mu_); + state().reset(seed); +} + +std::shared_ptr Generator::GetCPUEngine() { + return cpu_engine(); +} + +uint64_t Generator::Random64() { + std::lock_guard lock(mu_); + auto current_engine = cpu_engine(); + return (*current_engine)(); +} + +std::pair Generator::IncrementOffset(uint64_t increment) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) + std::lock_guard lock(mu_); + uint64_t offset = state().offset; + state().offset = offset + increment; + print_state_info(); + return std::make_pair(state().seed, offset); +#else + PADDLE_THROW(common::errors::PermissionDenied( + "Increment Offset only support in CUDA place")); +#endif +} + +} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h new file mode 100644 index 00000000000..2b222ba3b2c --- /dev/null +++ b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h @@ -0,0 +1,112 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +template +HOSTDEVICE T digamma_positive_domain(T x) { + constexpr T c = T{8.5}; + constexpr T euler_mascheroni = T{0.57721566490153286060}; + T r; + T value; + T x2; + + if (x <= T{0.000001}) { + value = -euler_mascheroni - T{1.0} / x + T{1.6449340668482264365} * x; + return value; + } + + value = T{0.0}; + x2 = x; + while (x2 < c) { + value = value - T{1.0} / x2; // NOLINT + x2 = x2 + T{1.0}; + } + + r = T{1.0} / x2; + value = value + std::log(x2) - T{0.5} * r; + + r = r * r; + + value = value - + r * (T{1.0} / T{12.0} - + r * (T{1.0} / T{120.0} - + r * (T{1.0} / T{252.0} - + r * (T{1.0} / T{240.0} - r * (T{1.0} / T{132.0}))))); + + return value; +} + +template +HOSTDEVICE T digamma(T x) { + const static T pi = T{3.14159265358979323846}; // NOLINT + + if (x == T{0.0}) { + T inf = std::numeric_limits::infinity(); + return std::signbit(x) ? 
inf : -inf; + } else if (x < T{0.0}) { + if (x == std::trunc(x)) { + return std::numeric_limits::quiet_NaN(); + } else { + T iptr; + T frac_part = std::modf(x, &iptr); + return digamma_positive_domain(T{1.0} - x) - + pi / std::tan(pi * frac_part); + } + } else { + return digamma_positive_domain(x); + } +} + +template +struct GammalnGradFunctor { + GammalnGradFunctor(const T* dout, const T* x, T* output, int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + using MT = typename phi::dtype::MPTypeTrait::Type; + const MT mp_dout = static_cast(dout_[idx]); + const MT mp_x = static_cast(x_[idx]); + output_[idx] = static_cast(mp_dout * digamma(mp_x)); + } + + private: + const T* dout_; + const T* x_; + T* output_; + int64_t numel_; +}; +template +void GammalnGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& d_out, + DenseTensor* d_x) { + auto numel = d_out.numel(); + if (d_x && d_x->numel() == 0) { + dev_ctx.template Alloc(d_x); + return; + } + auto* dout_data = d_out.data(); + auto* x_data = x.data(); + auto* dx_data = + dev_ctx.template Alloc(d_x, static_cast(numel * sizeof(T))); + phi::funcs::ForRange for_range(dev_ctx, numel); + GammalnGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); +} +} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu new file mode 100644 index 00000000000..766d984a25b --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu @@ -0,0 +1,362 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
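+
+// Registers the cudnn_lstm_grad kernel for the metax_gpu plugin, driving the
+// cuDNN/MIOpen RNN backward-data and backward-weights paths.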
+ +#include "kernels/metax_kernel/metax_context.h" //NOLINT +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cudnn_lstm_grad_kernel.h" +#include "paddle/phi/kernels/gpu/cudnn_lstm_utils.h" + +namespace phi { + +template +void CudnnLSTMGradKernel( + const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &init_h, + const DenseTensor &init_c, + const paddle::optional> &weight_list, + const paddle::optional &sequence_length, + const DenseTensor &out, + const DenseTensor &reserve, + const DenseTensor &state_out, + const DenseTensor &out_grad, + const DenseTensor &last_h_grad, + const DenseTensor &last_c_grad, + float dropout_prob, + bool is_bidirec, + int hidden_size, + int num_layers, + bool is_test, + int seed, + DenseTensor *x_grad, + DenseTensor *init_h_grad, + DenseTensor *init_c_grad, + std::vector weight_grad_list) { + auto input_dims = x.dims(); + auto init_h_dims = init_h.dims(); + auto init_c_dims = init_c.dims(); + + auto *init_h_data = init_h.data(); + auto *init_c_data = init_c.data(); + auto *out_data = out.data(); + auto *out_grad_data = out_grad.data(); + auto *last_h_grad_data = last_h_grad.data(); + auto *last_c_grad_data = last_c_grad.data(); + + auto running_weight_list = *weight_list.get_ptr(); + int weight_numel = size_sum(running_weight_list); + bool continuous = is_continuous>( + running_weight_list); + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + phi::DenseTensor weight_whole; + T *weight_data = nullptr; + + if (!continuous) { + weight_whole.Resize({weight_numel}); + dev_ctx.template Alloc(&weight_whole); + weight_to_tensor(place, stream, running_weight_list, &weight_whole); + weight_data = weight_whole.data(); + } else { + weight_data = const_cast(running_weight_list[0]->data()); + } + + phi::DenseTensor weight_grad; + phi::funcs::SetConstant zero; + weight_grad.Resize({weight_numel}); + dev_ctx.template Alloc(&weight_grad); + zero(dev_ctx, &weight_grad, static_cast(0.0)); + T *weight_grad_data = weight_grad.data(); + + int offset = 0; + for (size_t i = 0; i < weight_grad_list.size(); ++i) { + size_t len = weight_grad_list[i]->numel(); + auto dim = weight_grad_list[i]->dims(); + weight_grad_list[i] + ->ShareDataWith(weight_grad.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } + + x_grad->Resize(input_dims); + dev_ctx.template Alloc(x_grad); + auto *in_grad_data = x_grad->data(); + + if (init_h_grad) { + init_h_grad->Resize(init_h_dims); + dev_ctx.template Alloc(init_h_grad); + } + auto *init_h_grad_data = init_h_grad ? init_h_grad->data() : nullptr; + + if (init_c_grad) { + init_c_grad->Resize(init_c_dims); + dev_ctx.template Alloc(init_c_grad); + } + auto *init_c_grad_data = init_c_grad ? 
init_c_grad->data() : nullptr; + + auto running_seq_length = sequence_length.get_ptr(); + bool has_seq_length = running_seq_length != nullptr; + std::vector SequenceLength; + if (has_seq_length) { + SequenceLength = phi::GetVectorFromTensor(running_seq_length); + } + + int seq_length = input_dims[0]; + int batch_size = x.dims()[1]; + int input_size = x.dims()[2]; + + size_t workspace_size; + size_t reserve_size; + + ScopedRNNBase rnn(seq_length, + batch_size, + input_size, + hidden_size, + num_layers, + dropout_prob, + seed, + weight_numel, + true, + is_bidirec); + + rnn.Create(handle, + dev_ctx.GetPlace(), + SequenceLength, + &workspace_size, + &reserve_size, + const_cast(&state_out)); + + phi::DenseTensor workspace_data_; + workspace_data_.Resize({static_cast(workspace_size)}); + dev_ctx.template Alloc(&workspace_data_); + const uint8_t *reserve_data = reserve.data(); + +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardData_v8( + handle, + rnn.rnn_desc(), + nullptr, + rnn.y_seq_desc(), + out_data, + out_grad_data, + rnn.x_seq_desc(), + in_grad_data, + rnn.init_h_desc(), + init_h_data, + last_h_grad_data, + init_h_grad_data, + rnn.init_c_desc(), + init_c_data, + last_c_grad_data, + init_c_grad_data, + rnn.weights_size(), + weight_data, + workspace_size, + workspace_data_.data(), + reserve_size, + const_cast(reserve_data))); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights_v8( + handle, + rnn.rnn_desc(), + CUDNN_WGRAD_MODE_ADD, + nullptr, + rnn.x_seq_desc(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_seq_desc(), + out.data(), + rnn.weights_size(), + weight_grad_data, + workspace_size, + workspace_data_.data(), + reserve_size, + const_cast(reserve_data))); +#else + + if (!has_seq_length) { +// This interface is used when the input/output is unpadded. 
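+// The HIP branch below calls the MIOpen equivalents of the cuDNN routines.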
+#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenRNNBackwardData(handle, + rnn.rnn_desc(), + seq_length, + rnn.y_descs(), + out_data, + rnn.y_descs(), + out_grad_data, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_descs(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardWeights( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_descs(), + out.data(), + rnn.weight_desc(), + weight_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNBackwardData(handle, + rnn.rnn_desc(), + seq_length, + rnn.y_descs(), + out_data, + rnn.y_descs(), + out_grad_data, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_descs(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_descs(), + out.data(), + workspace_data_.data(), + workspace_size, + rnn.weight_desc(), + weight_grad_data, + const_cast(reserve_data), + reserve_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
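+    // The *Ex variants read the per-sample sequence lengths recorded in
+    // rnn.x_seq_desc() and rnn.y_seq_desc().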
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardDataEx( + handle, + rnn.rnn_desc(), + rnn.y_seq_desc(), + out_data, + rnn.y_seq_desc(), + out_grad_data, + nullptr, + nullptr, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_seq_desc(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + nullptr, + nullptr, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeightsEx( + handle, + rnn.rnn_desc(), + rnn.x_seq_desc(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_seq_desc(), + out.data(), + workspace_data_.data(), + workspace_size, + rnn.weight_desc(), + weight_grad_data, + const_cast(reserve_data), + reserve_size)); +#else + PADDLE_THROW(common::errors::Unavailable( + "The padded input of rnn is supported by cudnnRNNBackwardDataEx, " + "cudnnRNNBackwardWeightsEx, but it only works when the version " + "of cudnn is larger than 7.2.1")); +#endif + } + +#endif // end CUDNN_VERSION >= 90000 +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL( + cudnn_lstm_grad, GPU, ALL_LAYOUT, phi::CudnnLSTMGradKernel, float) {} +#else +PD_REGISTER_PLUGIN_KERNEL(cudnn_lstm_grad, + metax_gpu, + ALL_LAYOUT, + phi::CudnnLSTMGradKernel, + float, + double) {} +#endif diff --git a/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu new file mode 100644 index 00000000000..6bb94c9281a --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu @@ -0,0 +1,428 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
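+
+// Registers the cudnn_lstm forward kernel for the metax_gpu plugin; inference
+// goes through LSTMInference, training through the cuDNN/MIOpen forward
+// training calls.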
+ +#include "glog/logging.h" +#include "kernels/metax_kernel/metax_context.h" //NOLINT +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cudnn_lstm_kernel.h" +#include "paddle/phi/kernels/gpu/cudnn_lstm_utils.h" + +namespace phi { + +template +#ifdef PADDLE_WITH_HIP +void LSTMInference(const bool &has_seq_length, + const miopenHandle_t &handle, +#else +void LSTMInference(const bool &has_seq_length, + const cudnnHandle_t &handle, +#endif + const int &seq_length, + ScopedRNNBase *rnn, + const T *x_data, + const T *init_h_data, + const T *init_c_data, + const T *w_data, + T *out_data, + T *last_h_data, + T *last_c_data, + phi::DenseTensor *workspace_data, + const size_t &workspace_size) { +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForward(handle, + rnn->rnn_desc(), + CUDNN_FWD_MODE_INFERENCE, + nullptr, + rnn->x_seq_desc(), + x_data, + rnn->y_seq_desc(), + out_data, + rnn->init_h_desc(), + init_h_data, + last_h_data, + rnn->init_c_desc(), + init_c_data, + last_c_data, + rnn->weights_size(), + w_data, + workspace_size, + workspace_data->data(), + 0, + nullptr)); + +#else + + if (!has_seq_length) { +// for inference +// This interface is used when the input/output is unpadded. +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for inference + // This interface is used when the input/output is padded. 
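+    // cudnnRNNForwardInferenceEx takes the sequence lengths from the
+    // variable-length x/y sequence descriptors set up in rnn->Create().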
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardInferenceEx( + handle, + rnn->rnn_desc(), + rnn->x_seq_desc(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_seq_desc(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + workspace_data->data(), + workspace_size)); +#else + // CUDNN VERSION has to >=7.2.1 + PADDLE_THROW(common::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardInferenceEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } + +#endif // end CUDNN_VERSION >= 90000 +} + +template +void CudnnLSTMKernel( + const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &init_h, + const DenseTensor &init_c, + const paddle::optional &w, + const paddle::optional> &weight_list, + const paddle::optional &sequence_length, + float dropout_prob, + bool is_bidirec, + int hidden_size, + int num_layers, + bool is_test, + int seed, + DenseTensor *out, + DenseTensor *last_h, + DenseTensor *last_c, + DenseTensor *reserve, + DenseTensor *state_out) { + const T *x_data = x.data(); + const T *init_h_data = init_h.data(); + const T *init_c_data = init_c.data(); + + T *out_data = dev_ctx.template Alloc(out); + T *last_h_data = dev_ctx.template Alloc(last_h); + T *last_c_data = dev_ctx.template Alloc(last_c); + + if (!is_test) { + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. + int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = phi::DefaultCUDAGenerator(device_id); + seed = static_cast(gen_cuda->Random64()); + } + } + + auto *running_sequence_length = sequence_length.get_ptr(); + bool has_seq_length = running_sequence_length != nullptr; + std::vector SequenceLength; + if (has_seq_length) { + SequenceLength = phi::GetVectorFromTensor(running_sequence_length); + } + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + int seq_length = x.dims()[0]; + int batch_size = x.dims()[1]; + int input_size = x.dims()[2]; + bool state_initialized = state_out->initialized() ? true : false; + + size_t workspace_size; + size_t reserve_size; + phi::DenseTensor weight_whole; + T *w_data = nullptr; + int weight_numel; + bool w_initialized = false; + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + auto *running_w = w.get_ptr(); + if (is_test && running_w != nullptr) { + w_initialized = running_w->initialized() ? true : false; + weight_numel = running_w->numel(); + } + if (!w_initialized) { + auto running_weight_list = *weight_list.get_ptr(); + bool continuous = is_continuous>( + running_weight_list); + weight_numel = size_sum(running_weight_list); + + if (!continuous) { + LOG_FIRST_N(WARNING, 2) + << "If the memory space of the Input WeightList is not continuous, " + "less efficient calculation will be called. 
Please call " + "flatten_parameters() to make the input memory continuous."; + weight_whole.Resize({weight_numel}); + dev_ctx.template Alloc(&weight_whole); + weight_to_tensor(place, stream, running_weight_list, &weight_whole); + w_data = weight_whole.data(); + if (is_test) { // maybe also reset small weights' ptr for training + int offset = 0; + for (size_t i = 0; i < running_weight_list.size(); ++i) { + size_t len = running_weight_list[i]->numel(); + auto dim = running_weight_list[i]->dims(); + const_cast(running_weight_list[i]) + ->ShareDataWith( + weight_whole.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } + } + } else { + w_data = const_cast(running_weight_list[0]->data()); + } + } else { + w_data = const_cast(running_w->data()); + } + + ScopedRNNBase rnn(seq_length, + batch_size, + input_size, + hidden_size, + num_layers, + dropout_prob, + seed, + weight_numel, + state_initialized, + is_bidirec); + rnn.Create(handle, + dev_ctx.GetPlace(), + SequenceLength, + &workspace_size, + &reserve_size, + state_out); + + phi::DenseTensor workspace_data_; + workspace_data_.Resize({static_cast(workspace_size)}); + dev_ctx.template Alloc(&workspace_data_); + + reserve->Resize({static_cast(reserve_size)}); + auto *reserve_data = dev_ctx.template Alloc(reserve); + + if (is_test) { + LSTMInference(has_seq_length, + handle, + seq_length, + &rnn, + x_data, + init_h_data, + init_c_data, + w_data, + out_data, + last_h_data, + last_c_data, + &workspace_data_, + workspace_size); + } else { +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForward(handle, + rnn.rnn_desc(), + CUDNN_FWD_MODE_TRAINING, + nullptr, + rnn.x_seq_desc(), + x_data, + rnn.y_seq_desc(), + out_data, + rnn.init_h_desc(), + init_h_data, + last_h_data, + rnn.init_c_desc(), + init_c_data, + last_c_data, + rnn.weights_size(), + w_data, + workspace_size, + workspace_data_.data(), + reserve_size, + reserve_data)); +#else + + if (!has_seq_length) { +// for train +// This interface is used when the input/output is unpadded. +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNForwardTraining( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_descs(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForwardTraining(handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_descs(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardTrainingEx( + handle, + rnn.rnn_desc(), + rnn.x_seq_desc(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_seq_desc(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#else + PADDLE_THROW(common::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardTrainingEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } +#endif // end CUDNN_VERSION >= 90000 + } +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(cudnn_lstm, GPU, ALL_LAYOUT, phi::CudnnLSTMKernel, float) { + kernel->InputAt(5).SetDataType(phi::DataType::INT32); + kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); +} +#else +PD_REGISTER_PLUGIN_KERNEL( + cudnn_lstm, metax_gpu, ALL_LAYOUT, phi::CudnnLSTMKernel, float, double) { + kernel->InputAt(5).SetDataType(phi::DataType::INT32); + kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); +} +#endif diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index b4f1afbe5b0..4e54e17b3ef 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -19,3 +19,7 @@ test_uniform_random_op test_c_embedding_op test_slice_op test_compare_op +test_conv3d_transpose_op +test_conv3d_layer +test_conv3d_transpose_part2_op +test_fused_conv2d_add_act_op From cccf6b7e68cbaedd28c666773020d094556ab251 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:12:32 +0800 Subject: [PATCH 056/121] [metax] rename yaml file (#77) * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file --------- --- .github/workflows/metax_work.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index aff530d475c..f14023848c6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -1,4 +1,4 @@ -name: padlle metax gpu test +name: paddle metax gpu test on: workflow_dispatch: From 7a7a7a0590eb0b61be1bd7a911f37dfd521cc2ec Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:30:16 +0800 Subject: [PATCH 057/121] [metax] rm file (#78) * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file --------- --- .../cuda_kernels/gammaln_grad_kernel.cu | 28 ----- .../kernels/impl/gammaln_grad_kernel_impl.h | 112 ------------------ .../kernels/metax_kernel/rnn_kernel.cu.cc | 2 + 3 files changed, 2 insertions(+), 140 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu delete mode 100644 backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu deleted file mode 100644 index c6bd53f007f..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 
(c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "kernels/impl/gammaln_grad_kernel_impl.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gammaln_grad_kernel.h" - -PD_REGISTER_PLUGIN_KERNEL(gammaln_grad, - metax_gpu, - ALL_LAYOUT, - phi::GammalnGradKernel, - float, - double, - phi::float16, - phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h deleted file mode 100644 index 2b222ba3b2c..00000000000 --- a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/for_range.h" - -namespace phi { -template -HOSTDEVICE T digamma_positive_domain(T x) { - constexpr T c = T{8.5}; - constexpr T euler_mascheroni = T{0.57721566490153286060}; - T r; - T value; - T x2; - - if (x <= T{0.000001}) { - value = -euler_mascheroni - T{1.0} / x + T{1.6449340668482264365} * x; - return value; - } - - value = T{0.0}; - x2 = x; - while (x2 < c) { - value = value - T{1.0} / x2; // NOLINT - x2 = x2 + T{1.0}; - } - - r = T{1.0} / x2; - value = value + std::log(x2) - T{0.5} * r; - - r = r * r; - - value = value - - r * (T{1.0} / T{12.0} - - r * (T{1.0} / T{120.0} - - r * (T{1.0} / T{252.0} - - r * (T{1.0} / T{240.0} - r * (T{1.0} / T{132.0}))))); - - return value; -} - -template -HOSTDEVICE T digamma(T x) { - const static T pi = T{3.14159265358979323846}; // NOLINT - - if (x == T{0.0}) { - T inf = std::numeric_limits::infinity(); - return std::signbit(x) ? 
inf : -inf; - } else if (x < T{0.0}) { - if (x == std::trunc(x)) { - return std::numeric_limits::quiet_NaN(); - } else { - T iptr; - T frac_part = std::modf(x, &iptr); - return digamma_positive_domain(T{1.0} - x) - - pi / std::tan(pi * frac_part); - } - } else { - return digamma_positive_domain(x); - } -} - -template -struct GammalnGradFunctor { - GammalnGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - using MT = typename phi::dtype::MPTypeTrait::Type; - const MT mp_dout = static_cast(dout_[idx]); - const MT mp_x = static_cast(x_[idx]); - output_[idx] = static_cast(mp_dout * digamma(mp_x)); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; -template -void GammalnGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& d_out, - DenseTensor* d_x) { - auto numel = d_out.numel(); - if (d_x && d_x->numel() == 0) { - dev_ctx.template Alloc(d_x); - return; - } - auto* dout_data = d_out.data(); - auto* x_data = x.data(); - auto* dx_data = - dev_ctx.template Alloc(d_x, static_cast(numel * sizeof(T))); - phi::funcs::ForRange for_range(dev_ctx, numel); - GammalnGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index 2598ce093e6..fa2c9e6e8b7 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -181,6 +181,7 @@ void RnnKernel(const Context &dev_ctx, else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; #else + VLOG(0) << "Leave lstmKernel.11"; gpuRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") rnn_mode = CUDNN_LSTM; @@ -228,6 +229,7 @@ void RnnKernel(const Context &dev_ctx, common::errors::InvalidArgument( "ROCm do not support SequenceLength yet.")); #endif + VLOG(0) << "Leave lstmKernel.12"; std::vector SequenceLength; if (has_seq_length) { SequenceLength = phi::GetVectorFromTensor(sequence_length.get_ptr()); From 5a76d35b53e1f7d970d6b388969ba56ae955dc0d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 17:18:00 +0800 Subject: [PATCH 058/121] metax_fix_ci (#79) * [metax] add Rules --------- --- .github/workflows/metax_work.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index f14023848c6..f73442b6fd5 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -7,6 +7,7 @@ on: branches: [develop, release/**] paths: - "**" + - "Paddle/**" - "!backends/**" - "backends/metax_gpu/**" From ceb55ebf2a0a0398f9fa318b79ac1e41a079a759 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Sat, 11 Oct 2025 09:45:57 +0800 Subject: [PATCH 059/121] [metax] add print tensor (#91) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context * modify kernels * modify kernels * modify kernels * add print tensor --- backends/metax_gpu/CMakeLists.txt | 2 + .../flags_declare.cc} | 11 + backends/metax_gpu/common/utils.cc | 297 ++++++++++++++++++ backends/metax_gpu/common/utils.h | 28 
++ 4 files changed, 338 insertions(+) rename backends/metax_gpu/{kernels/metax_kernel/flags_declare.cu => common/flags_declare.cc} (89%) create mode 100644 backends/metax_gpu/common/utils.cc create mode 100644 backends/metax_gpu/common/utils.h diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 475074ced89..e357a5e5912 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -648,6 +648,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/lars_momentum_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_sum_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/gpu/gpu_info.cc # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps @@ -687,6 +688,7 @@ file( RELATIVE ${CMAKE_SOURCE_DIR} runtime/runtime.cc passes/*.cc + common/*.cc kernels/*.cc kernels/*.cu kernels/fusion/*.cc diff --git a/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu b/backends/metax_gpu/common/flags_declare.cc similarity index 89% rename from backends/metax_gpu/kernels/metax_kernel/flags_declare.cu rename to backends/metax_gpu/common/flags_declare.cc index d7aefe54e9f..6b497cf9fdf 100644 --- a/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu +++ b/backends/metax_gpu/common/flags_declare.cc @@ -80,6 +80,17 @@ PHI_DEFINE_EXPORTED_bool( "faster but it may loss precision in most case. If true, the compute " "type will be set to fp16. Default is false."); +PHI_DEFINE_EXPORTED_string( + selected_gpus, + "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (GPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" + "share-memory only."); + PHI_DEFINE_EXPORTED_bool(use_fast_math, false, "Whether to use fast math GPU functions."); diff --git a/backends/metax_gpu/common/utils.cc b/backends/metax_gpu/common/utils.cc new file mode 100644 index 00000000000..58e835687d9 --- /dev/null +++ b/backends/metax_gpu/common/utils.cc @@ -0,0 +1,297 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "common/utils.h" + +#include "glog/logging.h" +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/backends/custom/custom_context.h" + +namespace phi { +namespace { +C_Status AsyncMemCpyH2D(const C_Device device, + C_Stream stream, + void* dst, + const void* src, + size_t size) { + if (size == 0) { + return C_SUCCESS; + } + + if (dst == NULL || src == NULL) { + return C_ERROR; + } + cudaError_t cudaErr = cudaSetDevice(device->id); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + return C_SUCCESS; +} + +C_Status AsyncMemCpyD2H(const C_Device device, + C_Stream stream, + void* dst, + const void* src, + size_t size) { + if (size == 0) { + return C_SUCCESS; + } + + if (dst == NULL || src == NULL) { + return C_ERROR; + } + + cudaError_t cudaErr = cudaSetDevice(device->id); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + return C_SUCCESS; +} + +C_Status AsyncMemCpyD2D(const C_Device device, + C_Stream stream, + void* dst, + const void* src, + size_t size) { + if (size == 0) { + VLOG(2) << "cudamemcpy successful: " << dst << " " << src << " " + << size; // NOLINT + return C_SUCCESS; + } + + if (dst == NULL || src == NULL) { + return C_ERROR; + } + + cudaError_t cudaErr = cudaSetDevice(device->id); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + VLOG(2) << "cudamemcpy successful: " << dst << " " << src << " " + << size; // NOLINT + return C_SUCCESS; +} + +template +inline void TensorCopy(const Context& dev_ctx, + const phi::DenseTensor& src, + bool blocking, + phi::DenseTensor* dst, + const phi::Place& dst_place = phi::CustomPlace()) { + auto* src_ptr = src.data(); + const auto& src_place = src.place(); + if (src_ptr == nullptr) { + return; + } + auto dst_place_ = dst_place; + if (dst_place_.GetType() != phi::AllocationType::CPU) { + dst_place_ = dev_ctx.GetPlace(); + } + + if (&src == dst) { + if (src_place == dst_place_) { + VLOG(6) << "Skip copy the same data(" << src_ptr << ") from " << src_place + << " to " << dst_place_; + } else { + VLOG(6) << "Src and dst are the same Tensor, in-place copy data(" + << src_ptr << ") from " << src_place << " to " << dst_place_; + const phi::DenseTensor src_copy = src; + TensorCopy(dev_ctx, src_copy, blocking, dst, dst_place_); + } + return; + } + + auto dst_dims = dst->dims(); + dst->Resize(src.dims()); + void* dst_ptr = nullptr; + if (dst_place_.GetType() != phi::AllocationType::CPU) { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + } else { + dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); + } + + PADDLE_ENFORCE_EQ( + dst->place(), + dst_place_, + phi::errors::Unavailable( + "The Dst Tensor's place and dst_place do not match, Tensor's place " + "place is %s, dst_place is %s.", + dst->place(), + dst_place_)); + + if (src_ptr == dst_ptr && src_place == dst_place_) { + if ((dst_dims == src.dims()) || (src_place == phi::CPUPlace())) { + VLOG(3) << "Skip copy the same data async from " << src_ptr << " in " + << src_place << " to " << dst_ptr << " in " << dst_place_; + return; + } else { + // scatter memory + phi::DenseTensor tmp_dst; + tmp_dst.set_meta(dst->meta()); + tmp_dst.Resize(dst_dims); + dst_ptr = dev_ctx.Alloc(&tmp_dst, 
tmp_dst.dtype()); + *dst = tmp_dst; + } + } + VLOG(4) << "src:" << src_ptr << " place: " << src_place + << " type:" << static_cast(src_place.GetType()) + << ", dst:" << dst_ptr << " place: " << dst_place_ + << " type:" << static_cast(dst_place_.GetType()); + + C_Stream stream = reinterpret_cast(dev_ctx.stream()); + + auto size = + (src.dims().size() != 0 ? src.numel() : 1) * phi::SizeOf(src.dtype()); + if (UNLIKELY(size) == 0) { + return; + } + + if (src_place.GetType() == phi::AllocationType::CPU && + dst_place_.GetType() == phi::AllocationType::CUSTOM) { + VLOG(6) << "TensorCopy from cpu to cus"; + C_Device_st device; + device.id = dst_place_.GetDeviceId(); + AsyncMemCpyH2D(&device, stream, dst_ptr, src_ptr, size); + if (blocking) { + dev_ctx.Wait(); + } + } else if (src_place.GetType() == phi::AllocationType::CUSTOM && + dst_place_.GetType() == phi::AllocationType::CPU) { + VLOG(6) << "TensorCopy from cus to cpu"; + C_Device_st device; + device.id = src_place.GetDeviceId(); + AsyncMemCpyD2H(&device, stream, dst_ptr, src_ptr, size); + if (blocking) { + dev_ctx.Wait(); + } + } else if (src_place.GetType() == phi::AllocationType::CUSTOM && + dst_place_.GetType() == phi::AllocationType::CUSTOM) { + VLOG(6) << "TensorCopy from cus to cus"; + if (src_place.GetDeviceType() == dst_place_.GetDeviceType()) { + if (src_place.GetDeviceId() == dst_place_.GetDeviceId()) { + C_Device_st device; + device.id = src_place.GetDeviceId(); + AsyncMemCpyD2D(&device, stream, dst_ptr, src_ptr, size); + if (blocking) { + dev_ctx.Wait(); + } + } else { + PADDLE_THROW( + phi::errors::Unimplemented("TensorCopy is not supported.")); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented("TensorCopy is not supported.")); + } + } else if (src_place.GetType() == phi::AllocationType::CPU && + dst_place_.GetType() == phi::AllocationType::CPU) { + VLOG(6) << "TensorCopy from cpu to cpu"; + std::memcpy(dst_ptr, src_ptr, size); + } +} + +template +std::ostream& PrintTensor(std::ostream& os, const phi::DenseTensor& tensor) { + phi::DenseTensor cpu_tensor; + if (tensor.place().GetType() != phi::AllocationType::CPU) { + auto dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(tensor.place())); + TensorCopy(*dev_ctx, tensor, true, &cpu_tensor, phi::CPUPlace()); + } else { + cpu_tensor = tensor; + } + os << "DenseTensor<"; + if (tensor.initialized()) { + os << phi::DataTypeToString(tensor.dtype()) << ", "; + os << tensor.place() << ", "; + os << "Shape(" << tensor.dims() << "), "; + os << "Strides(" << tensor.strides() << "), "; + os << "layout:" << tensor.layout() << ", "; + os << "data: ["; + + auto ptr = cpu_tensor.data(); + auto element_num = cpu_tensor.numel(); + // Note: int8_t && uint8_t is typedef of char, ostream unable to print + // properly + if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) { + if (element_num > 0) { + os << signed(ptr[0]); + for (int j = 1; j < element_num; ++j) { + os << " " << signed(ptr[j]); + } + } + } else { + if (element_num > 0) { + os << ptr[0]; + for (int j = 1; j < element_num; ++j) { + os << " " << ptr[j]; + } + } + } + os << "]"; + } else { + os << "NOT_INITED"; + } + os << ">"; + return os; +} +} // namespace + +#define FOR_EACH_DATA_TYPE_TO_PRINT(_) \ + _(bool, phi::DataType::BOOL) \ + _(int8_t, phi::DataType::INT8) \ + _(uint8_t, phi::DataType::UINT8) \ + _(int16_t, phi::DataType::INT16) \ + _(uint16_t, phi::DataType::UINT16) \ + _(int32_t, phi::DataType::INT32) \ + _(uint32_t, phi::DataType::UINT32) \ + _(int64_t, phi::DataType::INT64) \ + _(uint64_t, 
phi::DataType::UINT64) \ + _(phi::bfloat16, phi::DataType::BFLOAT16) \ + _(phi::float16, phi::DataType::FLOAT16) \ + _(float, phi::DataType::FLOAT32) \ + _(double, phi::DataType::FLOAT64) + +#define CALL_PRINT_TENSOR(cpp_type, data_type) \ + case data_type: \ + PrintTensor(os, t); \ + break; + +std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) { + switch (t.dtype()) { + FOR_EACH_DATA_TYPE_TO_PRINT(CALL_PRINT_TENSOR) + default: + VLOG(1) << "PrintTensor unrecognized data type:" << t.dtype(); + } + return os; +} +#undef FOR_EACH_DATA_TYPE_TO_PRINT +#undef CALL_PRINT_TENSOR +} // namespace phi diff --git a/backends/metax_gpu/common/utils.h b/backends/metax_gpu/common/utils.h new file mode 100644 index 00000000000..74e8aa9d788 --- /dev/null +++ b/backends/metax_gpu/common/utils.h @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { +std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t); +} From e533cc49db93959a0e5cabd00e3de8a71156b4b7 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 13 Oct 2025 10:05:21 +0800 Subject: [PATCH 060/121] [Metax] change_patch (#94) * [metax] change_patch --------- --- backends/metax_gpu/patch/paddle.patch | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 69d714ef6e0..f2e4f067bb2 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -902,11 +902,11 @@ index 9d4bb18d55..ea42cc10a9 100644 } } diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -index b8cfdbf3ce..fa14b94a77 100644 +index acb3b83bc9..264d2a2b3e 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -@@ -14,7 +14,7 @@ - +@@ -15,7 +15,7 @@ + #include "paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -915,11 +915,11 @@ index b8cfdbf3ce..fa14b94a77 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -index e838778952..83e805e75a 100644 +index b2d15a59f8..f64582e85a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -@@ -14,7 +14,7 @@ - +@@ -15,7 +15,7 @@ + #include "paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.h" #include 
"paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" From d398e1a8627fc862d61ead0aa17f0f8a39715b97 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 13 Oct 2025 17:02:47 +0800 Subject: [PATCH 061/121] update paddle (#95) * update paddle --------- --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 2588f489910..cc367e8767d 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 2588f4899106cd27bdfcc84ba4c2f5f7aac570ab +Subproject commit cc367e8767d49819b5100f22e279cd62a1587670 From 813b9230bc7dc67adbface58967e32faf0119ce8 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 13 Oct 2025 18:33:50 +0800 Subject: [PATCH 062/121] [metax] fix dot error (#96) * [metax] fix dot error --------- --- backends/metax_gpu/kernels/funcs/blas/blas.h | 8 +++++++- backends/metax_gpu/patch/paddle.patch | 13 +++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/kernels/funcs/blas/blas.h b/backends/metax_gpu/kernels/funcs/blas/blas.h index fa4b4643f89..75ea8c921e2 100644 --- a/backends/metax_gpu/kernels/funcs/blas/blas.h +++ b/backends/metax_gpu/kernels/funcs/blas/blas.h @@ -282,6 +282,9 @@ class Blas { template T DOT(int n, const T* x, const T* y) const; + template + void CUDOT( + int n, const T* x, int incx, const T* y, int incy, T* result) const; template void SCAL(int n, const T a, T* x) const; @@ -541,7 +544,10 @@ class BlasT : private Blas { T DOT(ARGS... args) const { return Base()->template DOT(args...); } - + template + void CUDOT(ARGS... args) const { + Base()->template CUDOT(args...); + } template void SCAL(ARGS... args) const { Base()->template SCAL(args...); diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index f2e4f067bb2..7ba32b5b399 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -942,6 +942,19 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. 
+diff --git a/paddle/phi/kernels/gpu/dot_kernel.cu b/paddle/phi/kernels/gpu/dot_kernel.cu +index af27ac89ab..ee0edc6b8e 100644 +--- a/paddle/phi/kernels/gpu/dot_kernel.cu ++++ b/paddle/phi/kernels/gpu/dot_kernel.cu +@@ -15,7 +15,7 @@ + #include "paddle/phi/kernels/dot_kernel.h" + #include "paddle/phi/backends/gpu/gpu_context.h" + #include "paddle/phi/core/kernel_registry.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/eigen/common.h" + + #include "paddle/phi/kernels/full_kernel.h" diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h From 6abf13c002bff418b261e20309f71fdd819c28eb Mon Sep 17 00:00:00 2001 From: metax666 Date: Tue, 14 Oct 2025 10:41:54 +0800 Subject: [PATCH 063/121] Update metax_work.yaml --- .github/workflows/metax_work.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index f73442b6fd5..fd7d04c0843 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -40,7 +40,7 @@ jobs: git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head - # git submodule update --init --recursive + git submodule update --init --recursive fi From 16d655b6ad22abe84e484a7bfe0a8c6c52d505a7 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 14 Oct 2025 15:22:59 +0800 Subject: [PATCH 064/121] [metax]rm opt path and fix activation_kernel bug (#98) * [metax]rm opt path and fix activation_kernel bug --------- --- backends/metax_gpu/CMakeLists.txt | 10 ++++---- backends/metax_gpu/cmake/dgc.cmake | 4 +-- .../activation_grad_kernel_register.cu | 25 +++++++++++++++---- .../activation_kernel_register.cu | 24 ++++++++++++++---- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index e357a5e5912..3e92996f9a2 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -703,9 +703,9 @@ file( set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA) - +set(MACA_PATH $ENV{MACA_PATH}) set(CMAKE_CUCC_COMPILER "cucc") -set(CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/") +set(CMAKE_CUCC_FLAGS "-I ${MACA_PATH}/tools/cu-bridge/include/") add_library(${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS}) @@ -734,9 +734,9 @@ target_link_libraries( ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) +target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmccl.so) +target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcFlashAttn.so) +target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcpti.so) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) diff --git a/backends/metax_gpu/cmake/dgc.cmake b/backends/metax_gpu/cmake/dgc.cmake index 4c54e636d5e..4c61f2e6bcb 100644 --- a/backends/metax_gpu/cmake/dgc.cmake +++ b/backends/metax_gpu/cmake/dgc.cmake @@ -62,8 +62,8 @@ if(EXISTS ${DGC_DOWNLOAD_DIR}/${DGC_CACHE_FILENAME}) else() download_dgc() endif() - 
-set(CU_BRIDGE_PATH "/opt/maca/tools/cu-bridge") +set(MACA_PATH $ENV{MACA_PATH}) +set(CU_BRIDGE_PATH "${MACA_PATH}/tools/cu-bridge") add_custom_command( OUTPUT "${CU_BRIDGE_PATH}/bin/nvcc" diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu index 6cdfb2f5242..6c46ef10c0f 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu @@ -119,7 +119,22 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } - +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + double attr1, \ + double attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -239,10 +254,10 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, - CudaSoftplusGradFunctor, - beta, - threshold); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX(Softplus, + CudaSoftplusGradFunctor, + beta, + threshold); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, CudaHardSigmoidGradFunctor, slope, diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu index f24f3e8abbc..363932cfc28 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu @@ -90,7 +90,21 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl>( \ dev_ctx, x, out, functor); \ } - +#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + double attr1, \ + double attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) @@ -139,10 +153,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardTanh, t_min, t_max) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b) -DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, - CudaSoftplusFunctor, - beta, - threshold) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS(Softplus, + CudaSoftplusFunctor, + beta, + threshold) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, CudaHardSigmoidFunctor, slope, From 4b596b94e638e29c7b520f96524eb9bbf0acce4e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 14 Oct 2025 17:17:54 +0800 Subject: [PATCH 065/121] updata_paddle (#99) * updata paddle --------- --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle 
b/Paddle index cc367e8767d..89f4bd92f49 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit cc367e8767d49819b5100f22e279cd62a1587670 +Subproject commit 89f4bd92f49e15a9e1803a9e582526b2b8e4557d From 94623f4d0492d688e8753655dc6229e7cecc0fa9 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Thu, 16 Oct 2025 10:34:54 +0800 Subject: [PATCH 066/121] [Metax] Fix some tests (#102) * fix some tests --- backends/metax_gpu/tests/CMakeLists.txt | 8 +- .../unit_test/test_conv3d_layer_metax.py | 381 ++++++ .../test_conv3d_transpose_op_metax.py | 764 ++++++++++++ .../test_conv3d_transpose_part2_op_metax.py | 108 ++ .../unit_test/test_deform_conv2d_metax.py | 323 +++++ .../test_deformable_conv_op_metax.py | 504 ++++++++ .../test_deformable_conv_v1_op_metax.py | 319 +++++ .../unit_test/test_einsum_0d_tensor_metax.py | 201 +++ .../tests/unit_test/test_fc_op_metax.py | 138 ++ .../test_imperative_double_grad_metax.py | 1106 +++++++++++++++++ .../unit_test/test_linalg_matrix_exp_metax.py | 268 ++++ 11 files changed, 4119 insertions(+), 1 deletion(-) create mode 100644 backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_fc_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 0c84ada4b65..084b5b8c601 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -49,7 +49,13 @@ foreach(test_name ${TEST_PROGRAMS}) continue() endif() - set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + string(FIND "${test_name}" "metax" METAX_SUFFIX_POS) + if(NOT METAX_SUFFIX_POS EQUAL -1) + set(CURRENT_TEST_PROGRAM ${METAX_UNIT_TEST_PATH}/${test_name}.py) + else() + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + endif() + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") else() diff --git a/backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py b/backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py new file mode 100644 index 00000000000..cd4cd290065 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py @@ -0,0 +1,381 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import numpy as np +from op_test import get_device_place, is_custom_device +from test_conv3d_op import conv3d_forward_naive + +import paddle +import paddle.base.dygraph as dg +import paddle.nn.functional as F +from paddle import base, nn +from paddle.base import core + +core.set_cudnn_switch(False) + + +class Conv3DTestCase(unittest.TestCase): + def __init__( + self, + methodName="runTest", + batch_size=4, + spatial_shape=(8, 8, 8), + num_channels=6, + num_filters=8, + filter_size=3, + padding=0, + stride=1, + dilation=1, + groups=1, + no_bias=False, + data_format="NCDHW", + dtype="float32", + ): + super().__init__(methodName) + self.batch_size = batch_size + self.num_channels = num_channels + self.num_filters = num_filters + self.spatial_shape = spatial_shape + self.filter_size = filter_size + + self.padding = padding + self.stride = stride + self.dilation = dilation + self.groups = groups + self.no_bias = no_bias + self.data_format = data_format + self.dtype = dtype + + def setUp(self): + self.channel_last = self.data_format == "NDHWC" + if self.channel_last: + input_shape = ( + self.batch_size, + *self.spatial_shape, + self.num_channels, + ) + else: + input_shape = ( + self.batch_size, + self.num_channels, + *self.spatial_shape, + ) + self.input = np.random.randn(*input_shape).astype(self.dtype) + + if isinstance(self.filter_size, int): + filter_size = [self.filter_size] * 3 + else: + filter_size = self.filter_size + self.weight_shape = weight_shape = ( + self.num_filters, + self.num_channels // self.groups, + *filter_size, + ) + self.weight = np.random.uniform(-1, 1, size=weight_shape).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform(-1, 1, size=(self.num_filters,)).astype( + self.dtype + ) + else: + self.bias = None + + def base_layer(self, place): + main = base.Program() + start = base.Program() + with ( + base.unique_name.guard(), + base.program_guard(main, start), + ): + input_shape = ( + (-1, -1, -1, -1, self.num_channels) + if self.channel_last + else (-1, self.num_channels, -1, -1, -1) + ) + x_var = paddle.static.data("input", input_shape, dtype=self.dtype) + weight_attr = paddle.nn.initializer.Assign(self.weight) + if self.bias is None: + bias_attr = False + else: + bias_attr = paddle.nn.initializer.Assign(self.bias) + y_var = paddle.nn.Conv3D( + in_channels=self.num_channels, + out_channels=self.num_filters, + kernel_size=self.filter_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + padding_mode="zeros", + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=self.data_format, + )(x_var) + feed_dict = {"input": self.input} + exe = base.Executor(place) + exe.run(start) + (y_np,) = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def functional(self, place): + main = base.Program() + start = base.Program() + with ( + base.unique_name.guard(), + base.program_guard(main, start), + ): + input_shape = ( + (-1, -1, -1, -1, self.num_channels) + if self.channel_last + else (-1, self.num_channels, -1, -1, -1) + ) + x_var = paddle.static.data("input", input_shape, dtype=self.dtype) + w_var = paddle.static.data("weight", self.weight_shape, dtype=self.dtype) + if not self.no_bias: + b_var = paddle.static.data( + "bias", (self.num_filters,), dtype=self.dtype + ) + else: + b_var = None + y_var = F.conv3d( + x_var, + w_var, + b_var, + padding=self.padding, + 
stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format, + ) + feed_dict = {"input": self.input, "weight": self.weight} + if self.bias is not None: + feed_dict["bias"] = self.bias + exe = base.Executor(place) + exe.run(start) + (y_np,) = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def paddle_nn_layer(self): + x_var = paddle.to_tensor(self.input) + x_var.stop_gradient = False + conv = nn.Conv3D( + self.num_channels, + self.num_filters, + self.filter_size, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format, + ) + conv.weight.set_value(self.weight) + if not self.no_bias: + conv.bias.set_value(self.bias) + y_var = conv(x_var) + y_var.backward() + y_np = y_var.numpy() + t1 = x_var.gradient() + return y_np, t1 + + def _test_pir_equivalence(self, place): + with paddle.pir_utils.IrGuard(): + result1 = self.base_layer(place) + result2 = self.functional(place) + with dg.guard(place): + result3, g1 = self.paddle_nn_layer() + np.testing.assert_array_almost_equal(result1, result2) + np.testing.assert_array_almost_equal(result2, result3) + + def runTest(self): + place = base.CPUPlace() + self._test_pir_equivalence(place) + + if base.core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + self._test_pir_equivalence(place) + + +class Conv3DErrorTestCase(Conv3DTestCase): + def runTest(self): + place = base.CPUPlace() + with ( + dg.guard(place), + self.assertRaises(ValueError), + ): + self.paddle_nn_layer() + + +def add_cases(suite): + suite.addTest(Conv3DTestCase(methodName="runTest")) + suite.addTest(Conv3DTestCase(methodName="runTest", stride=[1, 2, 1], dilation=2)) + suite.addTest(Conv3DTestCase(methodName="runTest", stride=2, dilation=(2, 1, 2))) + suite.addTest(Conv3DTestCase(methodName="runTest", padding="same", no_bias=True)) + suite.addTest( + Conv3DTestCase(methodName="runTest", filter_size=(3, 2, 3), padding="valid") + ) + suite.addTest(Conv3DTestCase(methodName="runTest", padding=(2, 3, 1))) + suite.addTest(Conv3DTestCase(methodName="runTest", padding=[1, 2, 2, 1, 2, 3])) + suite.addTest( + Conv3DTestCase( + methodName="runTest", + padding=[[0, 0], [0, 0], [1, 2], [2, 1], [2, 2]], + ) + ) + suite.addTest(Conv3DTestCase(methodName="runTest", data_format="NDHWC")) + suite.addTest( + Conv3DTestCase( + methodName="runTest", + data_format="NDHWC", + padding=[[0, 0], [1, 1], [3, 3], [2, 2], [0, 0]], + ) + ) + suite.addTest(Conv3DTestCase(methodName="runTest", groups=2, padding="valid")) + suite.addTest( + Conv3DTestCase( + methodName="runTest", + num_filters=6, + num_channels=3, + groups=3, + padding="valid", + ) + ) + + +def add_error_cases(suite): + suite.addTest(Conv3DErrorTestCase(methodName="runTest", num_channels=5, groups=2)) + suite.addTest( + Conv3DErrorTestCase( + methodName="runTest", num_channels=5, groups=2, padding=[-1, 1, 3] + ) + ) + + +def load_tests(loader, standard_tests, pattern): + suite = unittest.TestSuite() + add_cases(suite) + add_error_cases(suite) + return suite + + +def get_places(): + places = [] + if core.is_compiled_with_xpu(): + places.append(paddle.device.XPUPlace(0)) + elif core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + places.append(paddle.CPUPlace()) + return places + + +class TestConv3dAPI_Compatibility(unittest.TestCase): + def setUp(self): + np.random.seed(2025) + self.places = get_places() + self.shape_x = [2, 3, 8, 8, 8] # NCDHW + self.shape_w = [6, 3, 3, 3, 3] # Co, Cin, 
kD, kH, kW + self.dtype = "float32" + self.init_data() + + def init_data(self): + self.np_x = np.random.rand(*self.shape_x).astype(self.dtype) + self.np_w = np.random.rand(*self.shape_w).astype(self.dtype) + conv_param = { + "stride": [1, 1, 1], + "pad": [0, 0, 0], + "dilation": [1, 1, 1], + } + self.np_ref_out = conv3d_forward_naive(self.np_x, self.np_w, 1, conv_param) + + def test_dygraph_Compatibility(self): + for place in self.places: + paddle.device.set_device(place) + paddle.disable_static() + x = paddle.to_tensor(self.np_x) + w = paddle.to_tensor(self.np_w) + + paddle_dygraph_out = [] + # Position args (args) + out1 = paddle.nn.functional.conv3d(x, w) + paddle_dygraph_out.append(out1) + # Key words args (kwargs) for paddle + out2 = paddle.nn.functional.conv3d(x=x, weight=w) + paddle_dygraph_out.append(out2) + # Key words args for alias compatibility + out3 = paddle.nn.functional.conv3d(input=x, weight=w) + paddle_dygraph_out.append(out3) + # Combined args and kwargs + out4 = paddle.nn.functional.conv3d(x, weight=w) + paddle_dygraph_out.append(out4) + + # refer to test/xpu/test_conv3d_op_xpu.py + if isinstance(place, core.XPUPlace): + rtol = 5e-3 + atol = 5e-3 + else: + rtol = 1e-5 + atol = 0 + + # Check all dygraph results against reference + for out in paddle_dygraph_out: + np.testing.assert_allclose( + self.np_ref_out, out.numpy(), rtol=rtol, atol=atol + ) + paddle.enable_static() + + def test_static_Compatibility(self): + paddle.enable_static() + + fetch_list = [] + main = paddle.static.Program() + startup = paddle.static.Program() + with base.program_guard(main, startup): + x = paddle.static.data(name="x", shape=self.shape_x, dtype=self.dtype) + w = paddle.static.data(name="w", shape=self.shape_w, dtype=self.dtype) + + # Position args (args) + out1 = paddle.nn.functional.conv3d(x, w) + fetch_list.append(out1) + # Key words args (kwargs) for paddle + out2 = paddle.nn.functional.conv3d(x=x, weight=w) + fetch_list.append(out2) + # Key words args for alias compatibility + out3 = paddle.nn.functional.conv3d(input=x, weight=w) + fetch_list.append(out3) + # Combined args and kwargs + out4 = paddle.nn.functional.conv3d(x, weight=w) + fetch_list.append(out4) + + for place in self.places: + # refer to test/xpu/test_conv2d_op_xpu.py + if isinstance(place, core.XPUPlace): + rtol = 5e-3 + atol = 5e-3 + else: + rtol = 1e-5 + atol = 0 + + exe = base.Executor(place) + fetches = exe.run( + main, + feed={"x": self.np_x, "w": self.np_w}, + fetch_list=fetch_list, + ) + for out in fetches: + np.testing.assert_allclose( + out, self.np_ref_out, rtol=rtol, atol=atol + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py new file mode 100644 index 00000000000..6f55aac3361 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py @@ -0,0 +1,764 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle + +paddle.enable_static() +from op_test import ( + OpTest, + copy_bits_from_float_to_uint16, + get_device_place, + is_custom_device, +) + +from paddle.base import core + +core.set_cudnn_switch(False) + + +def convert_float_to_uint16(float_list, data_format="NCHW"): + if data_format == "NHWC": + float_list = np.transpose(float_list, [0, 4, 1, 2, 3]) + + new_output = [] + for x in np.nditer(float_list): + new_output.append(np.uint16(copy_bits_from_float_to_uint16(x))) + new_output = np.reshape(new_output, float_list.shape).view(np.uint16) + + if data_format == "NHWC": + new_output = np.transpose(new_output, [0, 2, 3, 4, 1]) + return new_output + + +def conv3dtranspose_forward_naive(input_, filter_, attrs): + padding_algorithm = attrs["padding_algorithm"] + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError( + f"Unknown Attr(padding_algorithm): '{padding_algorithm}'. " + "It can only be 'SAME' or 'VALID'." + ) + + if attrs["data_format"] == "NHWC": + input_ = np.transpose(input_, [0, 4, 1, 2, 3]) + in_n, in_c, in_d, in_h, in_w = input_.shape + f_c, f_out_c, f_d, f_h, f_w = filter_.shape + groups = attrs["groups"] + assert in_c == f_c + out_c = f_out_c * groups + sub_in_c = in_c // groups + + stride, pad, dilations = ( + attrs["strides"], + attrs["paddings"], + attrs["dilations"], + ) + + def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): + padding = [] + for input_size, filter_size, stride_size in zip( + input_shape, kernel_size, kernel_stride + ): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max( + ((out_size - 1) * stride_size + filter_size - input_size, 0) + ) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + ksize = filter_.shape[2:5] + if padding_algorithm == "VALID": + pad = [0, 0, 0, 0, 0, 0] + elif padding_algorithm == "SAME": + dilations = [1, 1, 1] + input_data_shape = input_.shape[2:5] + pad = _get_padding_with_SAME(input_data_shape, ksize, stride) + + pad_d_0, pad_d_1 = pad[0], pad[0] + pad_h_0, pad_h_1 = pad[1], pad[1] + pad_w_0, pad_w_1 = pad[2], pad[2] + if len(pad) == 6: + pad_d_0, pad_d_1 = pad[0], pad[1] + pad_h_0, pad_h_1 = pad[2], pad[3] + pad_w_0, pad_w_1 = pad[4], pad[5] + + d_block_d = dilations[0] * (f_d - 1) + 1 + d_block_h = dilations[1] * (f_h - 1) + 1 + d_block_w = dilations[2] * (f_w - 1) + 1 + out_d = (in_d - 1) * stride[0] + d_block_d + out_h = (in_h - 1) * stride[1] + d_block_h + out_w = (in_w - 1) * stride[2] + d_block_w + out = np.zeros((in_n, out_c, out_d, out_h, out_w)) + + for n in range(in_n): + for d in range(in_d): + for i in range(in_h): + for j in range(in_w): + for g in range(groups): + input_masked = input_[ + n, g * sub_in_c : (g + 1) * sub_in_c, d, i, j + ] # (c) + input_masked = np.reshape(input_masked, (sub_in_c, 1, 1, 1)) + input_masked = np.tile(input_masked, (1, f_d, f_h, f_w)) + + for k in range(f_out_c): + tmp_out = np.sum( + input_masked + * filter_[ + g * sub_in_c : (g + 1) * sub_in_c, + k, + :, + :, + :, + ], + axis=0, + ) + d1, d2 = d * stride[0], d * stride[0] + d_block_d + i1, i2 = i * stride[1], i * stride[1] + d_block_h + j1, j2 = j * stride[2], j * stride[2] + d_block_w + out[ + n, + g * f_out_c + k, + d1 : d2 : dilations[0], + i1 : i2 : dilations[1], + j1 : j2 : dilations[2], + ] += tmp_out + + out = out[ + :, + :, + 
pad_d_0 : out_d - pad_d_1, + pad_h_0 : out_h - pad_h_1, + pad_w_0 : out_w - pad_w_1, + ] + if attrs["data_format"] == "NHWC": + out = np.transpose(out, [0, 2, 3, 4, 1]) + return out + + +def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): + place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ["Input"], "Output", no_grad_set={"Filter"} + ) + + def test_check_grad_no_input(self): + place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ["Filter"], "Output", no_grad_set={"Input"} + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNFP16OP") + TestConv3DTransposeCUDNNFP16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNFP16 + + +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()) + or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.uint16 + + def test_check_output(self): + place = get_device_place() + self.check_output_with_place(place) + + def test_check_grad(self): + place = get_device_place() + self.check_grad_with_place( + place, + {"Input", "Filter"}, + "Output", + ) + + def test_check_grad_no_filter(self): + place = get_device_place() + self.check_grad_with_place( + place, + ["Input"], + "Output", + no_grad_set={"Filter"}, + ) + + def test_check_grad_no_input(self): + place = get_device_place() + self.check_grad_with_place( + place, + ["Filter"], + "Output", + no_grad_set={"Input"}, + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNBF16OP") + TestConv3DTransposeCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNBF16 + + +def conv3d_transpose_wrapper( + x, + weight, + stride=1, + padding=0, + output_padding=[], + output_size=[], + padding_algorithm="EXPLICIT", + groups=1, + dilation=1, + data_format="NCDHW", +): + if data_format == "AnyLayout": + data_format = "NCDHW" + return paddle._C_ops.conv3d_transpose( + x, + weight, + stride, + padding, + output_padding, + output_size, + padding_algorithm, + groups, + dilation, + data_format, + ) + + +class TestConv3DTransposeOp(OpTest): + def setUp(self): + # init as conv transpose + self.use_cudnn = False + self.check_no_input = False + self.check_no_filter = False + self.data_format = "NCHW" + self.pad = [0, 0, 0] + self.padding_algorithm = "EXPLICIT" + self.init_op_type() + self.init_kernel_type() + self.init_test_case() + + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = np.random.random(self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "padding_algorithm": 
self.padding_algorithm, + "dilations": self.dilations, + "groups": self.groups, + "use_cudnn": self.use_cudnn, + "data_format": self.data_format, + } + + output = conv3dtranspose_forward_naive(input, filter, self.attrs).astype( + "float32" + ) + + if self.is_bfloat16_op(): + self.inputs = { + "Input": convert_float_to_uint16(input), + "Filter": convert_float_to_uint16(filter), + } + else: + self.inputs = { + "Input": input, + "Filter": filter, + } + output = output.astype(self.dtype) + + self.outputs = {"Output": output} + + def test_check_output(self): + if self.use_cudnn: + place = get_device_place() + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() + + def test_check_grad(self): + if self.use_cudnn: + place = get_device_place() + self.check_grad_with_place( + place, + {"Input", "Filter"}, + "Output", + max_relative_error=0.03, + ) + else: + self.check_grad({"Input", "Filter"}, "Output", max_relative_error=0.03) + + def test_check_grad_no_filter(self): + if self.use_cudnn: + place = get_device_place() + self.check_grad_with_place( + place, + ["Input"], + "Output", + max_relative_error=0.03, + no_grad_set={"Filter"}, + ) + elif self.check_no_filter: + self.check_grad( + ["Input"], + "Output", + max_relative_error=0.03, + no_grad_set={"Filter"}, + ) + + def test_check_grad_no_input(self): + if self.use_cudnn: + place = get_device_place() + self.check_grad_with_place( + place, + ["Filter"], + "Output", + max_relative_error=0.03, + no_grad_set={"Input"}, + ) + elif self.check_no_input: + self.check_grad( + ["Filter"], + "Output", + max_relative_error=0.03, + no_grad_set={"Input"}, + ) + + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + def init_kernel_type(self): + self.dtype = np.float32 + + +class TestWithSymmetricPad(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_input = True + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class TestWithAsymmetricPad(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 0, 1, 0, 1, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class TestWithSAMEPad(TestConv3DTransposeOp): + def init_test_case(self): + self.stride = [1, 1, 2] + self.dilations = [1, 2, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 6] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 4] + self.padding_algorithm = "SAME" + + +class TestWithVALIDPad(TestConv3DTransposeOp): + def init_test_case(self): + self.stride = [2, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 4, 3] + self.padding_algorithm = "VALID" + + +class TestWithStride(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_filter = True + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = 
[f_c, 6, 3, 3, 3] + + +class TestWithGroups(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [1, 2, 5, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3, 3] + + +class TestWithDilation(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [2, 2, 2] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class Test_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +# ------------ test_cudnn ------------ +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNN(TestConv3DTransposeOp): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + def init_test_case(self): + self.pad = [1, 1, 1, 0, 0, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 4, 4, 4] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithSAMEPad(TestWithSAMEPad): + def init_test_case(self): + self.stride = [1, 1, 2] + self.dilations = [1, 2, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 4, 3] + self.padding_algorithm = "SAME" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithVALIDPad(TestWithVALIDPad): + def init_test_case(self): + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.padding_algorithm = "VALID" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class 
TestCUDNNWithStride(TestWithStride): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithGroups(TestWithGroups): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [1, 2, 5, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + # Please Don't remove the following code. + # Currently, CI use cudnn V5.0 which not support dilation conv. + # class TestCUDNNWithDilation(TestWithDilation): + # def init_test_case(self): + # self.pad = [1, 1, 1] + # self.stride = [2, 2, 2] + # self.dilations = [2, 2, 2] + # self.input_size = [2, 3, 5, 5, 5] # NCDHW + # f_c = self.input_size[1] + # self.filter_size = [f_c, 6, 3, 3, 3] + # + # def init_op_type(self): + # self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNN_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + def init_test_case(self): + self.pad = [1, 0, 1, 0, 0, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithStride_NHWC(TestWithStride): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + 
self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithGroups_NHWC(TestWithGroups): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 3, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestConv3DTransposeOp) +create_test_cudnn_fp16_class(TestWithSymmetricPad) +create_test_cudnn_fp16_class(TestWithAsymmetricPad) +create_test_cudnn_fp16_class(TestWithSAMEPad) +create_test_cudnn_fp16_class(TestWithVALIDPad) +create_test_cudnn_fp16_class(TestWithStride) +create_test_cudnn_fp16_class(TestWithGroups) +create_test_cudnn_fp16_class(TestWithDilation) +create_test_cudnn_fp16_class(Test_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestConv3DTransposeOp) +create_test_cudnn_bf16_class(TestWithSymmetricPad) +create_test_cudnn_bf16_class(TestWithAsymmetricPad) +create_test_cudnn_bf16_class(TestWithSAMEPad) +create_test_cudnn_bf16_class(TestWithVALIDPad) +create_test_cudnn_bf16_class(TestWithStride) +create_test_cudnn_bf16_class(TestWithGroups) +create_test_cudnn_bf16_class(TestWithDilation) +create_test_cudnn_bf16_class(Test_NHWC) + + +class TestConv3dTranspose(unittest.TestCase): + def error_weight_input(self): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [1, 1, 1, 1, 1]), dtype="float32") + weight = paddle.to_tensor(np.reshape(array, [1]), dtype="float32") + paddle.nn.functional.conv3d_transpose(x, weight, bias=0) + + def test_type_error(self): + self.assertRaises(ValueError, self.error_weight_input) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py new file mode 100644 index 00000000000..9bf91f5908f --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py @@ -0,0 +1,108 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import unittest + +sys.path.append("../../legacy_test") +from test_conv3d_transpose_op import ( + TestConv3DTransposeOp, + create_test_cudnn_bf16_class, + create_test_cudnn_fp16_class, +) + +from paddle.base import core + +core.set_cudnn_switch(False) + + +class TestWithSymmetricPad_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithAsymmetricPad_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 0, 1, 0, 1, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithGroups_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_filter = True + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [2, 5, 5, 5, 4] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 3, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithStride_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NCDHW + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithDilation_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_input = True + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [2, 2, 2] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NCDHW + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithGroups_NHWC) +create_test_cudnn_fp16_class(TestWithStride_NHWC) +create_test_cudnn_fp16_class(TestWithDilation_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithGroups_NHWC) +create_test_cudnn_bf16_class(TestWithStride_NHWC) +create_test_cudnn_bf16_class(TestWithDilation_NHWC) + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py b/backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py new file mode 100644 index 00000000000..da5eeb34d0b --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py @@ -0,0 +1,323 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from unittest import TestCase + +import numpy as np +from op_test import get_device_place, is_custom_device + +import paddle +import paddle.nn.initializer as I + +from paddle.base import core + +core.set_cublas_switch(False) + + +class TestDeformConv2D(TestCase): + batch_size = 4 + spatial_shape = (5, 5) + dtype = "float32" + + def setUp(self): + self.in_channels = 2 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [0, 0] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = True + + def prepare(self): + np.random.seed(1) + paddle.seed(1) + if isinstance(self.kernel_size, int): + filter_shape = (self.kernel_size,) * 2 + else: + filter_shape = tuple(self.kernel_size) + self.filter_shape = filter_shape + + self.weight = np.random.uniform( + -1, + 1, + (self.out_channels, self.in_channels // self.groups, *filter_shape), + ).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform(-1, 1, (self.out_channels,)).astype( + self.dtype + ) + + def out_size(in_size, pad_size, dilation_size, kernel_size, stride_size): + return ( + in_size + 2 * pad_size - (dilation_size * (kernel_size - 1) + 1) + ) / stride_size + 1 + + out_h = int( + out_size( + self.spatial_shape[0], + self.padding[0], + self.dilation[0], + self.kernel_size[0], + self.stride[0], + ) + ) + out_w = int( + out_size( + self.spatial_shape[1], + self.padding[1], + self.dilation[1], + self.kernel_size[1], + self.stride[1], + ) + ) + out_shape = (out_h, out_w) + + self.input_shape = ( + self.batch_size, + self.in_channels, + *self.spatial_shape, + ) + + self.offset_shape = ( + self.batch_size, + self.deformable_groups * 2 * filter_shape[0] * filter_shape[1], + *out_shape, + ) + + self.mask_shape = ( + self.batch_size, + self.deformable_groups * filter_shape[0] * filter_shape[1], + *out_shape, + ) + + self.input = np.random.uniform(-1, 1, self.input_shape).astype(self.dtype) + + self.offset = np.random.uniform(-1, 1, self.offset_shape).astype(self.dtype) + + self.mask = np.random.uniform(-1, 1, self.mask_shape).astype(self.dtype) + + def static_graph_case_dcn(self): + main = paddle.static.Program() + start = paddle.static.Program() + paddle.enable_static() + with paddle.static.program_guard(main, start): + x = paddle.static.data( + "input", (-1, self.in_channels, -1, -1), dtype=self.dtype + ) + offset = paddle.static.data( + "offset", + ( + -1, + self.deformable_groups + * 2 + * self.filter_shape[0] + * self.filter_shape[1], + -1, + -1, + ), + dtype=self.dtype, + ) + mask = paddle.static.data( + "mask", + ( + -1, + self.deformable_groups + * self.filter_shape[0] + * self.filter_shape[1], + -1, + -1, + ), + dtype=self.dtype, + ) + + y_v1 = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=self.deformable_groups, + weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + )(x, offset, None) + + y_v2 = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=self.deformable_groups, + 
weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + )(x, offset, mask) + + exe = paddle.static.Executor(self.place) + exe.run(start) + out_v1, out_v2 = exe.run( + main, + feed={ + "input": self.input, + "offset": self.offset, + "mask": self.mask, + }, + fetch_list=[y_v1, y_v2], + ) + return out_v1, out_v2 + + def dygraph_case_dcn(self): + paddle.disable_static() + x = paddle.to_tensor(self.input) + offset = paddle.to_tensor(self.offset) + mask = paddle.to_tensor(self.mask) + + bias = None if self.no_bias else paddle.to_tensor(self.bias) + + deform_conv2d = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + deformable_groups=self.deformable_groups, + groups=self.groups, + weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + ) + + y_v1 = deform_conv2d(x, offset) + y_v2 = deform_conv2d(x, offset, mask) + + out_v1 = y_v1.numpy() + out_v2 = y_v2.numpy() + + return out_v1, out_v2 + + def _test_identity(self): + self.prepare() + static_dcn_v1, static_dcn_v2 = self.static_graph_case_dcn() + dy_dcn_v1, dy_dcn_v2 = self.dygraph_case_dcn() + np.testing.assert_array_almost_equal(static_dcn_v1, dy_dcn_v1) + np.testing.assert_array_almost_equal(static_dcn_v2, dy_dcn_v2) + + def test_identity(self): + self.place = paddle.CPUPlace() + self._test_identity() + + if paddle.is_compiled_with_cuda() or is_custom_device(): + self.place = get_device_place() + self._test_identity() + + +# testcases for DeformConv2D +class TestDeformConv2DWithPadding(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = True + + +class TestDeformConv2DWithBias(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithAsynPadding(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithDilation(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [3, 3] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithStride(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [2, 2] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithDeformable_Groups(TestDeformConv2D): + def setUp(self): + self.in_channels = 5 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 5 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithGroups(TestDeformConv2D): + def setUp(self): + self.in_channels = 5 + self.out_channels = 5 + 
self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 5 + self.no_bias = False + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py b/backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py new file mode 100644 index 00000000000..1f26abb73f8 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py @@ -0,0 +1,504 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from itertools import product + +import numpy as np +from op_test import OpTest + +import paddle + +paddle.enable_static() + +from paddle.base import core + +core.set_cublas_switch(False) + + +def dmc_bilinear(data_im, height, width, h, w): + h_low = int(np.floor(h)) + w_low = int(np.floor(w)) + h_high = h_low + 1 + w_high = w_low + 1 + + lh = h - h_low + lw = w - w_low + hh = 1 - lh + hw = 1 - lw + + v1 = 0 + if h_low >= 0 and w_low >= 0: + v1 = data_im[h_low, w_low] + v2 = 0 + if h_low >= 0 and w_high <= width - 1: + v2 = data_im[h_low, w_high] + v3 = 0 + if h_high <= height - 1 and w_low >= 0: + v3 = data_im[h_high, w_low] + v4 = 0 + if h_high <= height - 1 and w_high <= width - 1: + v4 = data_im[h_high, w_high] + + w1, w2, w3, w4 = hh * hw, hh * lw, lh * hw, lh * lw + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 + + return val + + +def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param): + in_n, in_c, in_h, in_w = input.shape + out_c, f_c, f_h, f_w = filter.shape + + assert offset.shape == (in_n, 2 * f_h * f_w, in_h, in_w) + assert mask.shape == (in_n, f_h * f_w, in_h, in_w) + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + + stride, pad, dilation = ( + conv_param["stride"], + conv_param["pad"], + conv_param["dilation"], + ) + out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] + out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] + assert out_h == in_h + assert out_w == in_w + + col_buffer = np.zeros((in_n, in_c * f_h * f_w, in_h * in_w)) + for n, c, h, w, kh, kw in product( + range(in_n), + range(in_c), + range(out_h), + range(out_w), + range(f_h), + range(f_w), + ): + offset_h_table = offset[n, ::2, h, w].reshape(f_h, f_w) + offset_w_table = offset[n, 1::2, h, w].reshape(f_h, f_w) + mask_table = mask[n, :, h, w].reshape(f_h, f_w) + offset_h = offset_h_table[kh, kw] + offset_w = offset_w_table[kh, kw] + val = 0 + im_h = h * stride[0] + kh * dilation[0] + offset_h - pad[0] + im_w = w * stride[0] + kw * dilation[0] + offset_w - pad[1] + if im_h > -1 and im_w > -1 and im_h < in_h and im_w < in_h: + val = dmc_bilinear(input[n, c], in_h, in_w, im_h, im_w) + val_out = val * mask_table[kh, kw] + col_buffer[n, c * f_h * f_w + kh * f_w + kw, h * in_w + w] = val_out + + out = np.zeros((in_n, group, int(out_c // group), out_h * out_w)) + weight = filter.reshape(group, 
int(out_c // group), f_c * f_h * f_w) + col_buffer = col_buffer.reshape( + (in_n, group, int(in_c // group * f_h * f_w), in_h * in_w) + ) + for n in range(in_n): + for g in range(group): + out[n, g] = np.matmul(weight[g], col_buffer[n, g]) + out = out.reshape(in_n, out_c, out_h, out_w) + return out + + +def deform_conv2d_wrapper( + x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1, +): + return paddle.vision.ops.deform_conv2d( + x, + offset, + weight, + None, + stride, + padding, + dilation, + deformable_groups, + groups, + mask, + ) + + +class TestModulatedDeformableConvOp(OpTest): + def setUp(self): + self.python_api = deform_conv2d_wrapper + self.op_type = "deformable_conv" + self.init_type() + self.init_group() + self.init_dilation() + self.init_test_case() + + conv_param = { + "stride": self.stride, + "pad": self.pad, + "dilation": self.dilations, + } + + input = np.random.random(self.input_size).astype(self.dtype) + offset = 10 * np.random.random(self.offset_size).astype(self.dtype) + mask = 10 * np.random.random(self.mask_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + + output = dconv_im2col_gemm(input, offset, mask, filter, self.groups, conv_param) + output = output.astype(self.dtype) + + self.inputs = { + "Input": OpTest.np_dtype_to_base_dtype(input), + "Offset": OpTest.np_dtype_to_base_dtype(offset), + "Mask": OpTest.np_dtype_to_base_dtype(mask), + "Filter": OpTest.np_dtype_to_base_dtype(filter), + } + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "groups": self.groups, + "deformable_groups": self.deformable_groups, + "im2col_step": self.im2col_step, + "dilations": self.dilations, + } + self.outputs = {"Output": output} + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad( + {"Input", "Offset", "Mask", "Filter"}, + "Output", + max_relative_error=0.05, + check_pir=True, + ) + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 8, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_type(self): + self.dtype = np.float32 + + +class TestWithStride(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [3, 3] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + 
self.input_size[2], + self.input_size[3], + ] + + +class TestWithDilation(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [1, 1] + self.input_size = [4, 3, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [2, 2] + + +class TestWith3x3(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestWithGroup(TestModulatedDeformableConvOp): + def init_group(self): + self.groups = 2 + + +class TestWithDouble(TestModulatedDeformableConvOp): + def init_type(self): + self.dtype = np.float64 + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 6, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestModulatedDeformableConvInvalidInput(unittest.TestCase): + def test_error_api(self): + def test_invalid_input(): + paddle.enable_static() + input = [1, 3, 32, 32] + offset = paddle.static.data( + name="offset", shape=[None, 3, 32, 32], dtype="float32" + ) + mask = paddle.static.data( + name="mask", shape=[None, 3, 32, 32], dtype="float32" + ) + loss = paddle.vision.ops.DeformConv2D( + in_channels=input[1], out_channels=4, kernel_size=1 + )(input, offset, mask) + + self.assertRaises(TypeError, test_invalid_input) + + def test_invalid_offset(): + paddle.enable_static() + input = paddle.static.data( + name="input", shape=[None, 3, 32, 32], dtype="int32" + ) + offset = paddle.static.data( + name="offset", shape=[None, 3, 32, 32], dtype="float32" + ) + mask = paddle.static.data( + name="mask", shape=[None, 3, 32, 32], dtype="float32" + ) + loss = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], out_channels=4, kernel_size=1 + )(input, offset, mask) + + self.assertRaises(TypeError, test_invalid_offset) + + def test_invalid_groups(): + 
paddle.enable_static() + input = paddle.static.data( + name="input_groups", shape=[1, 1, 1, 1], dtype="float32" + ) + offset = paddle.static.data( + name="offset_groups", shape=[1, 1], dtype="float32" + ) + mask = paddle.static.data(name="mask_groups", shape=[1], dtype="float32") + loss = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], + out_channels=1, + kernel_size=1, + padding=1, + groups=0, + )(input, offset, mask) + + self.assertRaises(ZeroDivisionError, test_invalid_groups) + + +class TestDeformConv2DAPI(unittest.TestCase): + def test_api(self): + def test_deform_conv2d_v1(): + paddle.enable_static() + input = paddle.static.data( + name="input_v1", shape=[None, 3, 32, 32], dtype="float32" + ) + offset = paddle.static.data( + name="offset_v1", shape=[None, 4, 32, 32], dtype="float32" + ) + out = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], out_channels=4, kernel_size=1 + )(input, offset, None) + assert tuple(out.shape) == (-1, 4, 32, 32) + + test_deform_conv2d_v1() + + def test_deform_conv2d_v2(): + paddle.enable_static() + input = paddle.static.data( + name="input_v2", shape=[None, 3, 32, 32], dtype="float32" + ) + offset = paddle.static.data( + name="offset_v2", shape=[None, 4, 32, 32], dtype="float32" + ) + mask = paddle.static.data( + name="mask_v2", shape=[None, 2, 32, 32], dtype="float32" + ) + out = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], out_channels=4, kernel_size=1 + )(input, offset, mask) + + assert tuple(out.shape) == (-1, 4, 32, 32) + + test_deform_conv2d_v2() + + +class TestModulatedDeformableConvOp_ZeroSize(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + # 0-size + self.input_size = [0, 8, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestDeformConv2DAPI_CPU_FP16(unittest.TestCase): + def setUp(self): + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.data_format = "NCL" + + def test_cpu_fp16(self): + with paddle.base.dygraph.guard(paddle.CPUPlace()): + x = paddle.ones([4, 5, 5, 5]) + offset = paddle.ones([4, 90, 5, 5]).astype(paddle.float16) + weight = paddle.ones([5, 5, 3, 3]).astype(paddle.float16) + bias = paddle.ones([5]).astype(paddle.float16) + mask = paddle.ones([4, 45, 5, 5]).astype(paddle.float16) + + # If there is an error, an error will be thrown. 
+ out = paddle.vision.ops.deform_conv2d( + x, + offset, + weight, + bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=5, + mask=mask, + ) + np.testing.assert_allclose(out.shape, [4, 5, 5, 5]) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py b/backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py new file mode 100644 index 00000000000..6a4244db267 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py @@ -0,0 +1,319 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from itertools import product + +import numpy as np +from op_test import OpTest + +import paddle + +from paddle.base import core + +core.set_cublas_switch(False) + + +def dmc_bilinear(data_im, height, width, h, w): + h_low = int(np.floor(h)) + w_low = int(np.floor(w)) + h_high = h_low + 1 + w_high = w_low + 1 + + lh = h - h_low + lw = w - w_low + hh = 1 - lh + hw = 1 - lw + + v1 = 0 + if h_low >= 0 and w_low >= 0: + v1 = data_im[h_low, w_low] + v2 = 0 + if h_low >= 0 and w_high <= width - 1: + v2 = data_im[h_low, w_high] + v3 = 0 + if h_high <= height - 1 and w_low >= 0: + v3 = data_im[h_high, w_low] + v4 = 0 + if h_high <= height - 1 and w_high <= width - 1: + v4 = data_im[h_high, w_high] + + w1, w2, w3, w4 = hh * hw, hh * lw, lh * hw, lh * lw + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 + + return val + + +def dconv_im2col_gemm(input, offset, filter, group, conv_param): + in_n, in_c, in_h, in_w = input.shape + out_c, f_c, f_h, f_w = filter.shape + + assert offset.shape == (in_n, 2 * f_h * f_w, in_h, in_w) + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + + stride, pad, dilation = ( + conv_param["stride"], + conv_param["pad"], + conv_param["dilation"], + ) + out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] + out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] + assert out_h == in_h + assert out_w == in_w + + col_buffer = np.zeros((in_n, in_c * f_h * f_w, in_h * in_w)) + for n, c, h, w, kh, kw in product( + range(in_n), + range(in_c), + range(out_h), + range(out_w), + range(f_h), + range(f_w), + ): + offset_h_table = offset[n, ::2, h, w].reshape(f_h, f_w) + offset_w_table = offset[n, 1::2, h, w].reshape(f_h, f_w) + offset_h = offset_h_table[kh, kw] + offset_w = offset_w_table[kh, kw] + val = 0 + im_h = h * stride[0] + kh * dilation[0] + offset_h - pad[0] + im_w = w * stride[0] + kw * dilation[0] + offset_w - pad[1] + if im_h > -1 and im_w > -1 and im_h < in_h and im_w < in_h: + val = dmc_bilinear(input[n, c], in_h, in_w, im_h, im_w) + val_out = val + + col_buffer[n, c * f_h * f_w + kh * f_w + kw, h * in_w + w] = val_out + + out = np.zeros((in_n, group, int(out_c // group), out_h * out_w)) + weight = filter.reshape(group, int(out_c // group), f_c * f_h * f_w) + col_buffer = 
col_buffer.reshape( + (in_n, group, int(in_c // group * f_h * f_w), in_h * in_w) + ) + for n in range(in_n): + for g in range(group): + out[n, g] = np.matmul(weight[g], col_buffer[n, g]) + out = out.reshape(in_n, out_c, out_h, out_w) + return out + + +def deform_conv2d_wrapper( + x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1, +): + return paddle.vision.ops.deform_conv2d( + x, + offset, + weight, + None, + stride, + padding, + dilation, + deformable_groups, + groups, + mask, + ) + + +class TestModulatedDeformableConvOp(OpTest): + def setUp(self): + self.python_api = deform_conv2d_wrapper + self.op_type = "deformable_conv_v1" + self.init_type() + self.init_group() + self.init_dilation() + self.init_test_case() + + conv_param = { + "stride": self.stride, + "pad": self.pad, + "dilation": self.dilations, + } + + input = np.random.random(self.input_size).astype(self.dtype) + offset = 10 * np.random.random(self.offset_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + + output = dconv_im2col_gemm(input, offset, filter, self.groups, conv_param) + output = output.astype(self.dtype) + self.inputs = { + "Input": OpTest.np_dtype_to_base_dtype(input), + "Offset": OpTest.np_dtype_to_base_dtype(offset), + "Filter": OpTest.np_dtype_to_base_dtype(filter), + } + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "groups": self.groups, + "deformable_groups": self.deformable_groups, + "im2col_step": self.im2col_step, + "dilations": self.dilations, + } + self.outputs = {"Output": output} + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad( + ["Input", "Offset", "Filter"], + "Output", + max_relative_error=0.05, + check_pir=True, + ) + + def test_check_grad_no_filter(self): + self.check_grad( + ["Input", "Offset"], + "Output", + max_relative_error=0.1, + no_grad_set={"Filter"}, + check_pir=True, + ) + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 4, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_type(self): + self.dtype = np.float32 + + +class TestWithStride(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [3, 3] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestWithDilation(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [1, 1] + self.input_size = [5, 3, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 
1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [2, 2] + + +class TestWith1x1(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [40, f_c, 1, 1] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestWithGroup(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 8, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + def init_group(self): + self.groups = 2 + + +class TestWithDouble(TestModulatedDeformableConvOp): + def init_type(self): + self.dtype = np.float64 + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py b/backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py new file mode 100644 index 00000000000..f3f3bb30e34 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py @@ -0,0 +1,201 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest + +import numpy as np +from numpy.testing import assert_allclose + +import paddle + +from paddle.base import core + +core.set_cublas_switch(False) + +os.environ["NVIDIA_TF32_OVERRIDE"] = "0" + + +class Test0DCase0(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + def tearDown(self): + paddle.enable_static() + + def test_func(self): + x = paddle.rand([]) + x.stop_gradient = False + y = paddle.rand([]) + y.stop_gradient = False + z = paddle.einsum("...,...->...", x, y) + assert_allclose( + z.numpy(), + np.einsum("...,...->...", x.numpy(), y.numpy()), + atol=1e-6, + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [] + assert y.grad.shape == [] + + +class Test0DCase1(Test0DCase0): + def test_func(self): + x = paddle.rand([]) + x.stop_gradient = False + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("...,ij->...", x, y) + assert_allclose( + z.numpy(), np.einsum("...,ij->...", x.numpy(), y.numpy()), atol=1e-6 + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [] + assert y.grad.shape == [2, 2] + + +class Test0DCase2(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = False + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("ij,ij->", x, y) + assert_allclose( + z.numpy(), np.einsum("ij,ij->", x.numpy(), y.numpy()), atol=1e-6 + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [2, 2] + + +class Test0DCase3(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = True + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("ij,ij->", x, y) + assert_allclose( + z.numpy(), np.einsum("ij,ij->", x.numpy(), y.numpy()), atol=1e-6 + ) + z.mean().backward() + assert z.shape == [] + assert x.grad is None + assert y.grad.shape == [2, 2] + + +class Test0DCase4(Test0DCase0): + def test_func(self): + x = paddle.rand([]) + x.stop_gradient = False + z = paddle.einsum("...->...", x) + assert_allclose(z.numpy(), np.einsum("...->...", x.numpy()), atol=1e-6) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [] + assert x.grad.numpy() == 1.0 + + +class Test0DCase5(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = False + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("i...j, i...j->...", x, y) + assert_allclose( + z.numpy(), + np.einsum("i...j, i...j->...", x.numpy(), y.numpy()), + atol=1e-6, + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [2, 2] + + +class Test0DCase6(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = False + z = paddle.einsum("ij->", x) + assert_allclose(z.numpy(), np.einsum("ij->", x.numpy()), atol=1e-6) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [2, 2] + + +class Test0DCase7(Test0DCase0): + def test_func(self): + """ + 3 operands. 
+ """ + x = paddle.rand([2, 2]) + y = paddle.rand([]) + z = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + z.stop_gradient = False + o = paddle.einsum("ij...,...,...->...", x, y, z) + assert_allclose( + o.numpy(), + np.einsum("ij...,...,...->...", x.numpy(), y.numpy(), z.numpy()), + atol=1e-6, + ) + o.mean().backward() + assert o.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [] + assert z.grad.shape == [] + + +class Test0DCase8(Test0DCase0): + def test_func(self): + """ + 3 operands. + """ + x = paddle.rand([2, 2]) + y = paddle.rand([]) + z = paddle.rand([]) + e = paddle.rand([3, 1]) + x.stop_gradient = False + y.stop_gradient = False + z.stop_gradient = False + e.stop_gradient = False + o = paddle.einsum("ij...,...,..., km->...", x, y, z, e) + assert_allclose( + o.numpy(), + np.einsum( + "ij...,...,...,km->...", + x.numpy(), + y.numpy(), + z.numpy(), + e.numpy(), + ), + atol=1e-6, + ) + o.mean().backward() + assert o.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [] + assert z.grad.shape == [] + assert e.grad.shape == [3, 1] + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_fc_op_metax.py b/backends/metax_gpu/tests/unit_test/test_fc_op_metax.py new file mode 100644 index 00000000000..67afd71c5f9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_fc_op_metax.py @@ -0,0 +1,138 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +from op_test import OpTest + +from paddle.base import core + +core.set_cublas_switch(False) + +SEED = 2020 + + +def fc_refer(matrix, with_bias, with_relu=False): + in_n, in_c, in_h, in_w = matrix.input.shape + w_i, w_o = matrix.weights.shape + + x_data = np.reshape(matrix.input, [in_n, in_c * in_h * in_w]) + w_data = np.reshape(matrix.weights, [w_i, w_o]) + b_data = np.reshape(matrix.bias, [1, w_o]) + result = None + + if with_bias: + result = np.dot(x_data, w_data) + b_data + else: + result = np.dot(x_data, w_data) + + if with_relu: + return np.maximum(result, 0) + else: + return result + + +class MatrixGenerate: + def __init__(self, mb, ic, oc, h, w, bias_dims=2): + self.input = np.random.random((mb, ic, h, w)).astype("float32") + self.weights = np.random.random((ic * h * w, oc)).astype("float32") + if bias_dims == 2: + self.bias = np.random.random((1, oc)).astype("float32") + else: + self.bias = np.random.random(oc).astype("float32") + + +class TestFCOp(OpTest): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 10, 15, 3, 3, 2) + + def setUp(self): + self.op_type = "fc" + self.config() + + if self.with_bias: + self.inputs = { + "Input": self.matrix.input, + "W": self.matrix.weights, + "Bias": self.matrix.bias, + } + else: + self.inputs = {"Input": self.matrix.input, "W": self.matrix.weights} + + if self.with_relu: + activation_type = "relu" + else: + activation_type = "" + self.attrs = {"use_onednn": False, "activation_type": activation_type} + + self.outputs = {"Out": fc_refer(self.matrix, self.with_bias, self.with_relu)} + + def test_check_output(self): + self.check_output(check_dygraph=False) + + +class TestFCOpNoBias1(TestFCOp): + def config(self): + self.with_bias = False + self.with_relu = False + self.matrix = MatrixGenerate(2, 8, 10, 1, 1, 2) + + +class TestFCOpNoBias2(TestFCOp): + def config(self): + self.with_bias = False + self.with_relu = False + self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1) + + +class TestFCOpNoBias4(TestFCOp): + def config(self): + self.with_bias = False + self.with_relu = False + self.matrix = MatrixGenerate(1, 32, 64, 3, 3, 1) + + +class TestFCOpWithBias1(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = False + self.matrix = MatrixGenerate(3, 8, 10, 2, 1, 2) + + +class TestFCOpWithBias2(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1) + + +class TestFCOpWithBias3(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 64, 32, 3, 3, 1) + + +class TestFCOpWithPadding(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 4, 3, 128, 128, 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py b/backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py new file mode 100644 index 00000000000..803b00cc6b4 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py @@ -0,0 +1,1106 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from unittest import TestCase + +import numpy as np +from op_test import get_device, is_custom_device + +import paddle +import paddle.nn.functional as F +from paddle import base +from paddle.base.wrapped_decorator import wrap_decorator +from paddle.vision.models import resnet50, resnet101 + +from paddle.base import core + +core.set_cudnn_switch(False) + +core.set_cublas_switch(False) + + +def _dygraph_guard_(func): + def __impl__(*args, **kwargs): + if base.in_dygraph_mode(): + return func(*args, **kwargs) + else: + with base.dygraph.guard(): + return func(*args, **kwargs) + + return __impl__ + + +dygraph_guard = wrap_decorator(_dygraph_guard_) + + +def random_var(size, low=-1, high=1, dtype="float32"): + x_np = np.random.uniform(low=low, high=high, size=size).astype(dtype) + return paddle.to_tensor(x_np) + + +class TestEagerGrad(TestCase): + def test_simple_example_eager_grad(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + out = paddle.matmul(x, y) + dx = base.dygraph.grad(out, x) + + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + np.testing.assert_allclose(dx[0].numpy(), expected_dx, rtol=1e-05) + + def test_simple_example_eager_grad_allow_unused(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + dx = base.dygraph.grad(out, [x, z], allow_unused=True) + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + np.testing.assert_allclose(dx[0].numpy(), expected_dx, rtol=1e-05) + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + # x is unused input in the graph + self.assertIsNone(dx[1]) + + def test_simple_example_eager_grad_not_allow_unused(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # allow_unused is false in default + dx = base.dygraph.grad(out, [x, z]) + except ValueError as e: + error_msg = str(e) + assert error_msg.find("allow_unused") > 0 + + def test_simple_example_eager_grad_duplicate_input(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = 
np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # duplicate input will arise RuntimeError errors + dx = base.dygraph.grad(out, [x, x]) + except RuntimeError as e: + error_msg = str(e) + assert error_msg.find("duplicate") > 0 + + def test_simple_example_eager_grad_duplicate_output(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # duplicate output will arise RuntimeError errors + dx = base.dygraph.grad([out, out], [x]) + except RuntimeError as e: + error_msg = str(e) + assert error_msg.find("duplicate") > 0 + + def test_simple_example_eager_two_grad_output(self): + x1 = paddle.to_tensor([1.0, 2.0]) + x1.stop_gradient = False + x2 = paddle.to_tensor([1.0, 2.0]) + x2.stop_gradient = False + out1 = x1 * 2 + out2 = x2 * 2 + + dout2_record_by_hook = [] + + def record_hook(grad): + dout2_record_by_hook.append(grad) + + out2.register_hook(record_hook) + + out3 = paddle.multiply(out1, out2) + out4 = paddle.mean(out3) + egr_dout2, egr_dout3 = paddle.grad([out4], [out2, out3]) + + np.testing.assert_array_equal( + dout2_record_by_hook[0].numpy(), np.array([1.0, 2.0]) + ) + + x1 = paddle.to_tensor([1.0, 2.0]) + x1.stop_gradient = False + x2 = paddle.to_tensor([1.0, 2.0]) + x2.stop_gradient = False + out1 = x1 * 2 + out2 = x2 * 2 + + out3 = paddle.multiply(out1, out2) + out4 = paddle.mean(out3) + dout2, dout3 = paddle.grad([out4], [out2, out3]) + + self.assertEqual(dout2.stop_gradient, egr_dout2.stop_gradient) + self.assertEqual(dout3.stop_gradient, egr_dout3.stop_gradient) + np.testing.assert_array_equal(dout2.numpy(), egr_dout2.numpy()) + np.testing.assert_array_equal(dout3.numpy(), egr_dout3.numpy()) + + +class TestDygraphDoubleGrad(TestCase): + def setUp(self): + self.sort_sum_gradient = False + self.shape = [5, 10] + + def grad( + self, + outputs, + inputs, + grad_outputs=None, + no_grad_vars=None, + retain_graph=None, + create_graph=False, + allow_unused=False, + ): + base.set_flags({"FLAGS_sort_sum_gradient": self.sort_sum_gradient}) + return base.dygraph.grad( + outputs=outputs, + inputs=inputs, + grad_outputs=grad_outputs, + no_grad_vars=no_grad_vars, + retain_graph=retain_graph, + create_graph=create_graph, + allow_unused=allow_unused, + ) + + @dygraph_guard + def test_exception(self): + with self.assertRaises(AssertionError): + self.grad(None, None) + + shape = self.shape + + with self.assertRaises(AssertionError): + self.grad(1, random_var(shape)) + + with self.assertRaises(AssertionError): + self.grad(random_var(shape), 1) + + with self.assertRaises(AssertionError): + self.grad([1], [random_var(shape)]) + + with self.assertRaises(AssertionError): + self.grad([random_var(shape)], [1]) + + with self.assertRaises(AssertionError): + self.grad( + [random_var(shape), random_var(shape)], + [random_var(shape)], + [random_var(shape)], + ) + + with self.assertRaises(AssertionError): + 
self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=[1]) + + with self.assertRaises(AssertionError): + self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + + @dygraph_guard + def test_simple_example(self): + x = random_var(self.shape) + x.stop_gradient = False + y = x + 1 + + for create_graph in [False, True]: + (dx,) = self.grad([x], [x], create_graph=create_graph, retain_graph=True) + self.assertEqual(dx.shape, x.shape) + self.assertTrue(np.all(dx.numpy() == 1)) + self.assertNotEqual(dx.stop_gradient, create_graph) + + (dx_mul_2,) = self.grad( + [y, x], [x], create_graph=create_graph, retain_graph=True + ) + self.assertEqual(dx_mul_2.shape, x.shape) + self.assertTrue(np.all(dx_mul_2.numpy() == 2)) + self.assertNotEqual(dx_mul_2.stop_gradient, create_graph) + + (none_grad,) = self.grad( + [x], [y], create_graph=create_graph, allow_unused=True + ) + self.assertIsNone(none_grad) + + (grad_with_none_and_not_none,) = self.grad( + [x, y], [y], create_graph=create_graph + ) + self.assertTrue(grad_with_none_and_not_none.shape, x.shape) + self.assertTrue(np.all(grad_with_none_and_not_none.numpy() == 1)) + self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + + @dygraph_guard + def test_example_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = F.relu(x) + y2 = F.relu(x) + z = y1 + y2 + w = z * z + + w_mean = paddle.mean(w) + del y1, z, w + + (dx_actual,) = self.grad([w_mean], [x], create_graph=True, no_grad_vars=[y2]) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * (x_np > 0) * 2 + ).astype("float32") + + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + @dygraph_guard + def test_none_one_initial_gradient(self): + numel = 1 + for s in self.shape: + numel *= s + + half_numel = int(numel / 2) + half_x_positive = np.random.uniform(low=1, high=2, size=[half_numel]) + half_x_negative = np.random.uniform(low=-2, high=-1, size=[numel - half_numel]) + x_np = np.array(list(half_x_positive) + list(half_x_negative)).astype("float32") + np.random.shuffle(x_np) + + x = paddle.to_tensor(x_np) + x.stop_gradient = False + + alpha = 0.2 + y = paddle.nn.functional.leaky_relu(x, alpha) + y = y * y + z = y * y + + x_np = x.numpy() + relu_x_np = np.maximum(x_np, alpha * x_np).astype("float32") + relu_x_grad_np = ((x_np > 0) + (x_np < 0) * alpha).astype("float32") + dy_expected = (relu_x_np * relu_x_grad_np * 2).astype("float32") + dz_expected = (np.power(relu_x_np, 3) * relu_x_grad_np * 4).astype("float32") + + random_grad_y = random_var(y.shape, low=1, high=2) + random_grad_z = random_var(z.shape, low=1, high=2) + ones_grad_y = np.ones(y.shape).astype("float32") + ones_grad_z = np.ones(z.shape).astype("float32") + + original_random_grad_y = random_grad_y.numpy() + original_random_grad_z = random_grad_z.numpy() + + for grad_y in [random_grad_y]: + for grad_z in [random_grad_z]: + for create_graph in [False, True]: + (dx_actual,) = self.grad( + outputs=[y, z], + inputs=[x], + grad_outputs=[grad_y, grad_z], + create_graph=create_graph, + retain_graph=True, + ) + + grad_y_np = ones_grad_y if grad_y is None else grad_y.numpy() + grad_z_np = ones_grad_z if grad_z is None else grad_z.numpy() + + dx_expected = dy_expected * grad_y_np + dz_expected * grad_z_np + np.testing.assert_allclose( + dx_actual.numpy(), dx_expected, rtol=1e-05 + ) + + if grad_y is not 
None: + self.assertTrue(grad_y.stop_gradient) + np.testing.assert_array_equal( + grad_y.numpy(), original_random_grad_y + ) + + if grad_z is not None: + self.assertTrue(grad_z.stop_gradient) + np.testing.assert_array_equal( + grad_z.numpy(), original_random_grad_z + ) + + @dygraph_guard + def test_example_with_gradient_accumulation_and_create_graph(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y = F.relu(x) + z = y + 1 + w = z * z + + w_mean = paddle.mean(w) + del y, z, w + + (dx_actual,) = self.grad([w_mean], [x], create_graph=True) + del w_mean + + self.assertFalse(dx_actual.stop_gradient) + + # Theoretical result based on math calculation + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + 1) * (x_np > 0) * 2 + ).astype("float32") + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + loss = paddle.mean(dx_actual * dx_actual + x * x) + loss.backward(retain_graph=True) + + x_grad_actual = x.gradient() + x_grad_expected = ( + 2.0 / float(numel) * (x_np + dx_expected * (x_np > 0) * 2 / float(numel)) + ).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + for i in range(5): + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (i + 2) * ( + 2.0 + / float(numel) + * (x_np + dx_expected * (x_np > 0) * 2 / float(numel)) + ).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + @dygraph_guard + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = F.relu(x) + y2 = F.relu(x) + z = y1 + y2 + w = z * z + + w_mean = paddle.mean(w) + del y1, z, w + + (dx_actual,) = self.grad( + [w_mean], + [x], + retain_graph=True, + create_graph=True, + no_grad_vars=[y2], + ) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * (x_np > 0) * 2 + ).astype("float32") + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + loss = paddle.mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = ( + 2.0 / float(numel) * (x_np + dx_expected * (x_np > 0) * 4 / float(numel)) + ).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + @dygraph_guard + def test_example_with_gradient_accumulation_and_not_create_graph(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y = F.relu(x) + z = y + 1 + w = z * z + + w_mean = paddle.mean(w) + del y, z, w + + (dx_actual,) = self.grad([w_mean], [x], create_graph=False) + del w_mean + + self.assertTrue(dx_actual.stop_gradient) + + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + 1) * (x_np > 0) * 2 + ).astype("float32") + + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + loss = paddle.mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + +class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): + def setUp(self): + self.sort_sum_gradient = True + self.shape = [5, 10] + + +class TestDygraphDoubleGradVisitedUniq(TestCase): + def test_compare(self): + value = np.random.uniform(-0.5, 
0.5, 100).reshape(10, 2, 5).astype("float32") + + def model_f(input): + linear = paddle.nn.Linear(5, 3) + for i in range(10): + if i == 0: + out = linear(input) + else: + out = out + linear(input) + return out + + base.set_flags({"FLAGS_sort_sum_gradient": True}) + + with base.dygraph.guard(): + paddle.seed(123) + if paddle.framework.use_pir_api(): + with paddle.pir_utils.OldIrGuard(): + # Note: dygraph use self.main_program.global_block().create_parameter(), it's need manual seed to old Program + paddle.framework.random._manual_program_seed(123) + paddle.framework.random._manual_program_seed(123) + else: + paddle.framework.random._manual_program_seed(123) + a = paddle.to_tensor(value) + a.stop_gradient = False + + out = model_f(a) + + dx = base.dygraph.grad( + outputs=[out], + inputs=[a], + create_graph=False, + only_inputs=True, + allow_unused=False, + ) + + grad_1 = dx[0].numpy() + + with base.dygraph.guard(): + paddle.seed(123) + if paddle.framework.use_pir_api(): + with paddle.pir_utils.OldIrGuard(): + # Note: dygraph use self.main_program.global_block().create_parameter(), it's need manual seed to old Program + paddle.framework.random._manual_program_seed(123) + paddle.framework.random._manual_program_seed(123) + else: + paddle.framework.random._manual_program_seed(123) + a = paddle.to_tensor(value) + a.stop_gradient = False + + out = model_f(a) + out.backward() + + grad_2 = a.gradient() + + np.testing.assert_array_equal(grad_1, grad_2) + + +class TestDoubleGradResNet(TestCase): + def setUp(self): + paddle.seed(123) + if paddle.framework.use_pir_api(): + with paddle.pir_utils.OldIrGuard(): + # Note: dygraph use self.main_program.global_block().create_parameter(), it's need manual seed to old Program + paddle.framework.random._manual_program_seed(123) + paddle.framework.random._manual_program_seed(123) + else: + paddle.framework.random._manual_program_seed(123) + self.data = np.random.rand(1, 3, 224, 224).astype(np.float32) + + @dygraph_guard + def test_resnet_resnet50(self): + model = resnet50(pretrained=False) + egr_data = paddle.to_tensor(self.data) + egr_data.stop_gradient = False + egr_out = model(egr_data) + egr_preds = paddle.argmax(egr_out, axis=1) + egr_label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(egr_preds), num_classes=egr_out.shape[1] + ) + egr_target = paddle.sum(egr_out * egr_label_onehot, axis=1) + + egr_g = paddle.grad(outputs=egr_target, inputs=egr_out)[0] + egr_g_numpy = egr_g.numpy() + self.assertEqual(list(egr_g_numpy.shape), list(egr_out.shape)) + + model = resnet50(pretrained=False) + data = paddle.to_tensor(self.data) + data.stop_gradient = False + out = model(data) + preds = paddle.argmax(out, axis=1) + label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(preds), num_classes=out.shape[1] + ) + target = paddle.sum(out * label_onehot, axis=1) + + g = paddle.grad(outputs=target, inputs=out)[0] + g_numpy = g.numpy() + self.assertEqual(list(g_numpy.shape), list(out.shape)) + + np.testing.assert_array_equal(egr_out, out) + np.testing.assert_array_equal(egr_g_numpy, g_numpy) + + @dygraph_guard + def test_resnet_resnet101(self): + model = resnet101(pretrained=False) + egr_data = paddle.to_tensor(self.data) + egr_data.stop_gradient = False + egr_out = model(egr_data) + egr_preds = paddle.argmax(egr_out, axis=1) + egr_label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(egr_preds), num_classes=egr_out.shape[1] + ) + egr_target = paddle.sum(egr_out * egr_label_onehot, axis=1) + + egr_g = paddle.grad(outputs=egr_target, 
inputs=egr_out)[0] + egr_g_numpy = egr_g.numpy() + self.assertEqual(list(egr_g_numpy.shape), list(egr_out.shape)) + + model = resnet101(pretrained=False) + data = paddle.to_tensor(self.data) + data.stop_gradient = False + out = model(data) + preds = paddle.argmax(out, axis=1) + label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(preds), num_classes=out.shape[1] + ) + target = paddle.sum(out * label_onehot, axis=1) + + g = paddle.grad(outputs=target, inputs=out)[0] + g_numpy = g.numpy() + self.assertEqual(list(g_numpy.shape), list(out.shape)) + + np.testing.assert_array_equal(egr_out, out) + np.testing.assert_array_equal(egr_g_numpy, g_numpy) + + +class TestDoubleGradBasics(TestCase): + def test_matmul(self): + input_numpy = np.ones([3, 3]) * 2 + x = paddle.to_tensor(input_numpy, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy, stop_gradient=False, dtype="float32") + grad_out = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + + out = paddle.matmul(x, y, False, False) + new_x_g, new_y_g = paddle.grad( + [out], [x, y], [grad_out], retain_graph=True, create_graph=True + ) + new_x_g.backward() + + out_ref = np.ones([3, 3]) * 12.0 + np.testing.assert_array_equal(out.numpy(), out_ref) + + new_x_g_ref = np.ones([3, 3]) * 6.0 + new_y_g_ref = np.ones([3, 3]) * 6.0 + np.testing.assert_array_equal(new_x_g.numpy(), new_x_g_ref) + np.testing.assert_array_equal(new_y_g.numpy(), new_y_g_ref) + + x_grad_ref = np.ones([3, 3]) * 0.0 + np.testing.assert_array_equal(x.grad.numpy(), x_grad_ref) + + y_grad_ref = np.ones([3, 3]) * 3.0 + np.testing.assert_array_equal(y.grad.numpy(), y_grad_ref) + + grad_out_grad_ref = np.ones([3, 3]) * 6.0 + np.testing.assert_array_equal(grad_out.grad.numpy(), grad_out_grad_ref) + + +class TestDygraphDoubleGradMatmul(TestCase): + # case1: ddy is none, no broadcast,dims != 1 + def test_matmul_double_grad_case1(self): + input_numpy_x = np.random.random([3, 3]).astype("float32") + input_numpy_y = np.random.random([3, 3]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + (dx, dy) = paddle.grad( + [out], [x, y], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + ddy = ddx + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dx, dy], + [x, y, dout], + [ddx, ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + dy_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + ddout_expected1 = np.matmul(np.ones([3, 3], dtype="float32"), input_numpy_y) + ddout_expected2 = np.matmul(input_numpy_x, np.ones([3, 3], dtype="float32")) + ddout_expected = ddout_expected1 + ddout_expected2 + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in 
zip(expected_results, actual_results):
+                np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6)
+
+    # case2: ddx is none, no broadcast, dims != 1
+    def test_matmul_double_grad_case2(self):
+        input_numpy_x = np.random.random([3, 3]).astype("float32")
+        input_numpy_y = np.random.random([3, 3]).astype("float32")
+
+        def actual():
+            x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32")
+            y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32")
+            out = paddle.matmul(x, y, False, False)
+
+            dout = paddle.to_tensor(
+                np.ones([3, 3]), stop_gradient=False, dtype="float32"
+            )
+            (dy,) = paddle.grad(
+                [out], [y], [dout], retain_graph=True, create_graph=True
+            )
+            ddy = paddle.to_tensor(
+                np.ones([3, 3]), stop_gradient=False, dtype="float32"
+            )
+            # when x is not differentiated in the first grad, dy in the second grad could be None in the composite op
+            dx_double_grad, ddout = paddle.grad(
+                [dy],
+                [x, dout],
+                [ddy],
+                retain_graph=True,
+                create_graph=True,
+            )
+            return dx_double_grad, ddout
+
+        def expected():
+            dx_double_grad_expected = np.matmul(
+                np.ones([3, 3], dtype="float32"),
+                np.ones([3, 3], dtype="float32"),
+            )
+            ddout_expected = np.matmul(input_numpy_x, np.ones([3, 3], dtype="float32"))
+            return (
+                dx_double_grad_expected,
+                ddout_expected,
+            )
+
+        expected_results = expected()
+        places = ["cpu"]
+        if paddle.is_compiled_with_cuda() or is_custom_device():
+            places.append(get_device())
+        for place in places:
+            paddle.device.set_device(place)
+            actual_results = actual()
+            for expected_result, actual_result in zip(expected_results, actual_results):
+                np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6)
+
+    # case3: ddx is none, dims = 1
+    def test_matmul_double_grad_case3(self):
+        input_numpy_x = np.random.random([3]).astype("float32")
+        input_numpy_y = np.random.random([3]).astype("float32")
+
+        def actual():
+            x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32")
+            y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32")
+            out = paddle.matmul(x, y, False, False)
+
+            dout = paddle.to_tensor(np.ones([1]), stop_gradient=False, dtype="float32")
+            (dy,) = paddle.grad(
+                [out], [y], [dout], retain_graph=True, create_graph=True
+            )
+            ddy = paddle.to_tensor(np.ones([3]), stop_gradient=False, dtype="float32")
+            # when x is not differentiated in the first grad, dy from the second grad could be None in the composite api.
+ dx_double_grad, ddout = paddle.grad(
+ [dy],
+ [x, dout],
+ [ddy],
+ retain_graph=True,
+ create_graph=True,
+ )
+ return dx_double_grad, ddout
+
+ def expected():
+ dx_double_grad_expected = np.ones([3], dtype="float32")
+ ddout_expected = np.matmul(input_numpy_x, np.ones([3], dtype="float32"))
+ return (
+ dx_double_grad_expected,
+ ddout_expected,
+ )
+
+ expected_results = expected()
+ places = ["cpu"]
+ if paddle.is_compiled_with_cuda() or is_custom_device():
+ places.append(get_device())
+ for place in places:
+ paddle.device.set_device(place)
+ actual_results = actual()
+ for expected_result, actual_result in zip(expected_results, actual_results):
+ np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6)
+
+ # case4: ddy is none, dims = 1
+ def test_matmul_double_grad_case4(self):
+ input_numpy_x = np.random.random([3]).astype("float32")
+ input_numpy_y = np.random.random([3]).astype("float32")
+
+ def actual():
+ x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32")
+ y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32")
+ out = paddle.matmul(x, y, False, False)
+
+ dout = paddle.to_tensor(np.ones([1]), stop_gradient=False, dtype="float32")
+ (dx,) = paddle.grad(
+ [out], [x], [dout], retain_graph=True, create_graph=True
+ )
+ ddx = paddle.to_tensor(np.ones([3]), stop_gradient=False, dtype="float32")
+ # when y is not differentiated in the first grad, dx from the second grad could be None in the composite api.
+ dy_double_grad, ddout = paddle.grad(
+ [dx],
+ [y, dout],
+ [ddx],
+ retain_graph=True,
+ create_graph=True,
+ )
+ return dy_double_grad, ddout
+
+ def expected():
+ dy_double_grad_expected = np.ones([3], dtype="float32")
+ ddout_expected = np.matmul(input_numpy_y, np.ones([3], dtype="float32"))
+ return (
+ dy_double_grad_expected,
+ ddout_expected,
+ )
+
+ expected_results = expected()
+ places = ["cpu"]
+ if paddle.is_compiled_with_cuda() or is_custom_device():
+ places.append(get_device())
+ for place in places:
+ paddle.device.set_device(place)
+ actual_results = actual()
+
+ for expected_result, actual_result in zip(expected_results, actual_results):
+ np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6)
+
+ # case5: ddx is none, broadcast, dims != 1
+ def test_matmul_double_grad_case5(self):
+ input_numpy_x = np.random.random([2, 1]).astype("float32")
+ input_numpy_y = np.random.random([1]).astype("float32")
+
+ def actual():
+ x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32")
+ y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32")
+ out = paddle.matmul(x, y, False, False)
+
+ dout = paddle.to_tensor(np.ones([2]), stop_gradient=False, dtype="float32")
+ (dy,) = paddle.grad(
+ [out], [y], [dout], retain_graph=True, create_graph=True
+ )
+ ddy = paddle.to_tensor(np.ones([1]), stop_gradient=False, dtype="float32")
+ dx_double_grad, ddout = paddle.grad(
+ [dy],
+ [x, dout],
+ [ddy],
+ retain_graph=True,
+ create_graph=True,
+ )
+ return dx_double_grad, ddout
+
+ def expected():
+ dx_double_grad_expected = np.ones([2, 1], dtype="float32")
+ ddout_expected = np.matmul(input_numpy_x, np.ones([1], dtype="float32"))
+ return (
+ dx_double_grad_expected,
+ ddout_expected,
+ )
+
+ expected_results = expected()
+ places = ["cpu"]
+ if paddle.is_compiled_with_cuda() or is_custom_device():
+ places.append(get_device())
+ for place in places:
+ paddle.device.set_device(place)
+ actual_results = actual()
+ for expected_result, actual_result in zip(expected_results,
actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case6: ddy is none, broadcast, dims != 1 + def test_matmul_double_grad_case6(self): + input_numpy_x = np.random.random([2, 1]).astype("float32") + input_numpy_y = np.random.random([1]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor(np.ones([2]), stop_gradient=False, dtype="float32") + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([2, 1]), stop_gradient=False, dtype="float32" + ) + dy_double_grad, ddout = paddle.grad( + [dx], + [y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dy_double_grad, ddout + + def expected(): + dy_double_grad_expected = np.ones([1], dtype="float32") * 2 + ddout_expected = np.ones([2], dtype="float32") * input_numpy_y[0] + return ( + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # TODO(Ruting) test complex dtype when composite api support + """ + # case7: ddx is none, dims = 1, complex dtype + def test_matmul_double_grad_case7(self): + input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y_conj = np.conjugate(input_numpy_y) + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='complex64' + ) + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='complex64' + ) + # when y is not be differentiate in first grad, dx from second grad could be None in composite api. 
+ dy_double_grad, ddout = paddle.grad( + [dx], + [y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dy_double_grad, ddout + + def expected(): + dy_double_grad_expected = np.ones( + [3], dtype="float32" + ) + 0j * np.ones([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_y_conj, np.ones([3], dtype="float32") + ) + return ( + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if (paddle.is_compiled_with_cuda() or is_custom_device()): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + + # case8: ddy is none, dims = 1, complex dtype + def test_matmul_double_grad_case8(self): + input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_x_conj = np.conjugate(input_numpy_x) + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='complex64' + ) + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='complex64' + ) + dx_double_grad, ddout = paddle.grad( + [dy], + [x, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_x_conj, np.ones([3], dtype="float32") + ) + return ( + dx_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if (paddle.is_compiled_with_cuda() or is_custom_device()): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + """ + + def test_value_error(self): + def test(): + import paddle + from paddle import nn + + model = nn.Sequential(nn.Linear(3, 4)) + + x = paddle.randn([4, 1]) + y = paddle.randn([4, 1]) + z = paddle.randn([4, 1]) + x.stop_gradient = False + y.stop_gradient = False + z.stop_gradient = False + out = model(paddle.concat((x, y, z), axis=1)) + + data = { + "x": x, + "y": y, + "z": z, + "u": out[:, 0:1], + "v": out[:, 1:2], + "w": out[:, 2:3], + "p": out[:, 3:4], + } + + v = out[:, 1:2] + z = paddle.grad(v, x, create_graph=True)[0] + zz = paddle.grad(z, x, create_graph=True)[0] + + with self.assertRaises(ValueError): + test() + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py b/backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py new file mode 100644 index 00000000000..e39de09d6e4 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py @@ -0,0 +1,268 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import unittest + +import numpy as np +import scipy +from op_test import get_places + +import paddle + +from paddle.base import core + +core.set_cublas_switch(False) + +os.environ["NVIDIA_TF32_OVERRIDE"] = "0" + +if sys.platform == "win32": + RTOL = {"float32": 1e-02, "float64": 1e-04} + ATOL = {"float32": 1e-02, "float64": 1e-04} +elif sys.platform == "darwin": + RTOL = {"float32": 1e-06, "float64": 1e-12} + ATOL = {"float32": 1e-06, "float64": 1e-12} +elif scipy.__version__ < "1.15": + RTOL = {"float32": 1e-06, "float64": 1e-15} + ATOL = {"float32": 1e-06, "float64": 1e-15} +else: + RTOL = {"float32": 1e-06, "float64": 1e-13} + ATOL = {"float32": 1e-06, "float64": 1e-13} + + +class MatrixExpTestCase(unittest.TestCase): + def setUp(self): + self.init_config() + self.generate_input() + self.generate_output() + self.places = get_places() + + def generate_input(self): + self._input_shape = (5, 5) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + def generate_output(self): + self._output_data = scipy.linalg.expm(self._input_data) + + def init_config(self): + self.dtype = "float64" + + def test_dygraph(self): + for place in self.places: + paddle.disable_static(place) + x = paddle.to_tensor(self._input_data, place=place) + out = paddle.linalg.matrix_exp(x).numpy() + + np.testing.assert_allclose( + out, + self._output_data, + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + # TODO(megemini): cond/while_loop should be tested in pir + # + def test_static(self): + paddle.enable_static() + + for place in get_places(): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.static.data( + name="input", + shape=self._input_shape, + dtype=self._input_data.dtype, + ) + + out = paddle.linalg.matrix_exp(x) + exe = paddle.static.Executor(place) + + res = exe.run( + feed={"input": self._input_data}, + fetch_list=[out], + )[0] + + np.testing.assert_allclose( + res, + self._output_data, + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + def test_grad(self): + for place in self.places: + x = paddle.to_tensor(self._input_data, place=place, stop_gradient=False) + out = paddle.linalg.matrix_exp(x) + out.backward() + x_grad = x.grad + + self.assertEqual(list(x_grad.shape), list(x.shape)) + self.assertEqual(x_grad.dtype, x.dtype) + + +class MatrixExpTestCaseFloat32(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCase3D(MatrixExpTestCase): + def generate_input(self): + self._input_shape = (2, 5, 5) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCase3DFloat32(MatrixExpTestCase3D): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCase4D(MatrixExpTestCase): + def generate_input(self): + self._input_shape = (2, 3, 5, 5) + np.random.seed(123) + self._input_data = 
np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCase4DFloat32(MatrixExpTestCase4D): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCaseEmpty(MatrixExpTestCase): + def generate_input(self): + self._input_shape = () + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCaseEmptyFloat32(MatrixExpTestCaseEmpty): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCaseScalar(MatrixExpTestCase): + def generate_input(self): + self._input_shape = (2, 3, 1, 1) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCaseScalarFloat32(MatrixExpTestCaseScalar): + def init_config(self): + self.dtype = "float32" + + +# test precision for float32 with l1_norm comparing `conds` +class MatrixExpTestCasePrecisionFloat32L1norm0(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.2], [-0.2, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat32L1norm1(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.8], [-0.8, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat32L1norm2(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 2.0], [-2.0, 0]]).astype(self.dtype) + + +# test precision for float64 with l1_norm comparing `conds` +class MatrixExpTestCasePrecisionFloat64L1norm0(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.01], [-0.01, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm1(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.1], [-0.1, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm2(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.5], [-0.5, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm3(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 1.5], [-1.5, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm4(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 2.5], [-2.5, 0]]).astype(self.dtype) + + +# test error cases +class MatrixExpTestCaseError(unittest.TestCase): + def test_error_dtype(self): + with self.assertRaises(ValueError): + x = np.array(123, dtype=int) + paddle.linalg.matrix_exp(x) + + def test_error_ndim(self): + # 1-d + with self.assertRaises(ValueError): + x = np.random.rand(1) + paddle.linalg.matrix_exp(x) + + # not square + with self.assertRaises(ValueError): + x = np.random.rand(3, 4) + paddle.linalg.matrix_exp(x) + + with self.assertRaises(ValueError): + x = np.random.rand(2, 3, 4) + paddle.linalg.matrix_exp(x) + + +if __name__ == "__main__": + 
unittest.main() From 07b41e0823c0dc588b3bc048d18c97059cae56e2 Mon Sep 17 00:00:00 2001 From: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Date: Thu, 16 Oct 2025 13:48:11 +0800 Subject: [PATCH 067/121] [metax] support wint4 in quantize (#103) --- .../weight_quantize_kernel_register.cu | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 4e2a4ce240c..44ac7f2fddc 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -115,11 +115,12 @@ void WeightQuantizeKernel(const Context& dev_ctx, dev_ctx.template Alloc(scale); weight_quant_gpu(dev_ctx, x.data(), - quanted_x.data(), + out->data(), scale->data(), weight_shape, arch, algo); + out->Resize({m, n}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); @@ -133,12 +134,12 @@ void WeightQuantizeKernel(const Context& dev_ctx, funcs::Transpose trans; trans(dev_ctx, x_int_tmp, out, axis); #else - weight_permute_gpu(dev_ctx, - quanted_x.data(), - out->data(), - weight_shape, - arch, - algo); + // weight_permute_gpu(dev_ctx, + // quanted_x.data(), + // out->data(), + // weight_shape, + // arch, + // algo); #endif } else if (algo == "w4a8") { weight_permute_gpu_w4a8(dev_ctx, From 581a9e2824fa38aeec47e3c158b51d4d988821c3 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:30:35 +0800 Subject: [PATCH 068/121] updata_metax (#104) * test * test --------- --- .github/workflows/metax_work.yaml | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index fd7d04c0843..360846846c2 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -5,12 +5,6 @@ on: pull_request: types: [opened, synchronize] branches: [develop, release/**] - paths: - - "**" - - "Paddle/**" - - "!backends/**" - - "backends/metax_gpu/**" - permissions: read-all defaults: @@ -34,18 +28,33 @@ jobs: --jobs=8 \ --branch ${{ github.base_ref || github.ref_name}} \ --recurse-submodules \ - https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . + https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/PaddlePaddle/PaddleCustomDevice.git . if [ "${{ github.event_name }}" == "pull_request" ]; then git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head - git submodule update --init --recursive + paddle_branch=${{ github.base_ref || github.ref_name}} + change_numbers=$(git diff --name-only remotes/origin/${paddle_branch} | wc -l) + change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/"| wc -l) + change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/metax_gpu"| wc -l) + git diff --name-only remotes/origin/${paddle_branch} + + if [ $change_numbers -ne $change_backend ]; then + echo "Common file changed, continue to run metax FULL CI test ..." + elif [ $paddle_branch -eq 0 ] ; then + echo "NO metax backend changes found, skip metax FULL CI ...." 
+ exit 0 + fi + + + # git submodule update --init --recursive fi - name: compile run: | + sleep 10000 cd backends/metax_gpu bash build.sh From 4ab7f5456a2bb339a667b1c117fe7fbf281c118e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:56:32 +0800 Subject: [PATCH 069/121] updata_metax (#105) * chang_meatx_yaml * chang_meatx_yaml * updata_metax * test * test * test * test --------- --- .github/workflows/metax_work.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 360846846c2..bdedcaa7c8e 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -54,7 +54,7 @@ jobs: - name: compile run: | - sleep 10000 + # sleep 10000 cd backends/metax_gpu bash build.sh From ef5306d1032ff492091ebdff47bae64c526eafb6 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Thu, 16 Oct 2025 17:09:38 +0800 Subject: [PATCH 070/121] add one test to metax (#107) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * fix some tests * add one test --------- Co-authored-by: sw <1640472053@qq.com> Co-authored-by: duqimeng <77875733+duqimeng@users.noreply.github.com> Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> --- .../test_fused_conv2d_add_act_op_metax.py | 429 ++++++++++++++++++ 1 file changed, 429 insertions(+) create mode 100644 backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py b/backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py new file mode 100644 index 00000000000..2b405a76367 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py @@ -0,0 +1,429 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +from op_test import OpTest, get_device_place, is_custom_device +from test_conv2d_op import conv2d_forward_naive + +from paddle.base import core + +core.set_cudnn_switch(False) + + +def create_test_padding_SAME_class(parent): + class TestPaddingSAMECase(parent): + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "SAME" + + cls_name = "{}_{}".format(parent.__name__, "PaddingSAMEOp") + TestPaddingSAMECase.__name__ = cls_name + globals()[cls_name] = TestPaddingSAMECase + + +def create_test_padding_VALID_class(parent): + class TestPaddingVALIDCase(parent): + def init_paddings(self): + self.pad = [1, 1] + self.padding_algorithm = "VALID" + + cls_name = "{}_{}".format(parent.__name__, "PaddingVALIDOp") + TestPaddingVALIDCase.__name__ = cls_name + globals()[cls_name] = TestPaddingVALIDCase + + +def create_test_cudnn_channel_last_class(parent): + @unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", + ) + class TestCudnnChannelLastCase(parent): + def init_test_case(self): + super().init_test_case() + self.data_format = "NHWC" + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + K1, K2, R, S = self.filter_size + self.filter_size = [K1, R, S, K2] + + def test_check_output(self): + print(self.attrs) + if self.has_cuda(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-5, check_dygraph=False) + + cls_name = "{}_{}".format(parent.__name__, "CudnnChannelLast") + TestCudnnChannelLastCase.__name__ = cls_name + globals()[cls_name] = TestCudnnChannelLastCase + + +class TestFusedConv2dAddActOp(OpTest): + def setUp(self): + self.op_type = "fused_conv2d_add_act" + self.exhaustive_search = False + self.data_format = "NCHW" + self.dtype = np.float32 + self.activation = "relu" + self.add_residual_data = True + self.split_channels = None + self.outputs = None + self.padding_algorithm = "EXIPLICIT" + + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_residual() + self.init_activation() + self.init_paddings() + self.set_search_method() + + conv2d_param = { + "stride": self.stride, + "pad": self.pad, + "dilation": self.dilations, + } + + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + bias = np.random.random(self.filter_size[0]).astype(self.dtype) + + if self.data_format == "NHWC": + filter_nchw = np.transpose(filter, [0, 3, 1, 2]) + else: + filter_nchw = filter + + self.output, _, _, _, _ = conv2d_forward_naive( + input, + filter_nchw, + self.groups, + conv2d_param, + self.padding_algorithm, + self.data_format, + ) + + self.output = self.output.astype(self.dtype) + + self.inputs = { + "Input": OpTest.np_dtype_to_base_dtype(input), + "Filter": OpTest.np_dtype_to_base_dtype(filter), + "Bias": OpTest.np_dtype_to_base_dtype(bias), + } + + if self.add_residual_data: + residual_data = np.random.random(self.output.shape).astype(self.dtype) + self.inputs["ResidualData"] = OpTest.np_dtype_to_base_dtype(residual_data) + self.output += residual_data + + # Add bias + if self.data_format == "NCHW": + self.output = self.output + bias.reshape((1, bias.size, 1, 1)) + else: + self.output = self.output + bias.reshape((1, 1, 1, bias.size)) + + assert self.activation in ["relu", "identity"] + if self.activation == "relu": + self.output = np.maximum(self.output, 0) + + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "groups": self.groups, + 
"dilations": self.dilations, + "data_format": self.data_format, + "exhaustive_search": self.exhaustive_search, + "activation": self.activation, + "padding_algorithm": self.padding_algorithm, + } + if self.split_channels is not None: + self.attrs["split_channels"] = self.split_channels + + self.outputs = {"Output": self.output} + + self.set_outputs() + + def has_cuda(self): + return core.is_compiled_with_cuda() or is_custom_device() + + def test_check_output(self): + if self.has_cuda(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-5, check_dygraph=False) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_residual(self): + self.add_residual_data = True + + def init_activation(self): + self.activation = "relu" + + def set_search_method(self): + self.exhaustive_search = False + + def set_outputs(self): + pass + + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithoutResidual(TestFusedConv2dAddActOp): + def init_residual(self): + self.add_residual_data = False + + +class TestIdentityActivation(TestFusedConv2dAddActOp): + def init_activation(self): + self.activation = "identity" + + +class TestIdentityActivation1(TestFusedConv2dAddActOp): + def init_activation(self): + self.activation = "identity" + self.add_residual_data = False + + +class TestWithGroup(TestFusedConv2dAddActOp): + def init_group(self): + self.groups = 3 + + +class TestWithDilation(TestFusedConv2dAddActOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 3 + + +class TestCUDNNExhaustiveSearch(TestFusedConv2dAddActOp): + def set_search_method(self): + self.exhaustive_search = True + + +class TestMultipleOutputs(TestFusedConv2dAddActOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [1, 32, 17, 17] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [126, f_c, 3, 3] + self.split_channels = [84, 42] + + def set_outputs(self): + out1 = self.output[:, 0:84, :, :] + out2 = self.output[:, 84:126, :, :] + self.outputs["Outputs"] = [("out1", out1), ("out2", out2)] + + +class TestAsyPadding(TestFusedConv2dAddActOp): + def init_paddings(self): + self.pad = [0, 0, 1, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWithPad_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_paddings(self): + self.pad = [2, 1, 3, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWithStride_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] 
+ + def init_paddings(self): + self.pad = [2, 1, 3, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWith1x1_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [2, 2, 4, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithGroup_AsyPadding(TestFusedConv2dAddActOp): + def init_group(self): + self.groups = 3 + + +class TestWithDepthWise3x3_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [3, 4, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [8, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 4 + + def init_paddings(self): + self.pad = [1, 3, 2, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDepthWise5x5_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 4, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [8, f_c, 5, 5] + + def init_group(self): + self.groups = 4 + + def init_paddings(self): + self.pad = [0, 1, 1, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDepthWise7x7_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 8, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [16, f_c, 7, 7] + + def init_group(self): + self.groups = 8 + + def init_paddings(self): + self.pad = [1, 3, 4, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDilation_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [0, 1, 3, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithInput1x1Filter1x1_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [0, 3, 4, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestSimpleNHWC(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [3, 5, 5, 2] # NHWC + self.data_format = "NHWC" + assert np.mod(self.input_size[3], self.groups) == 0 + f_c = self.input_size[3] // self.groups + self.filter_size = [4, 3, 3, f_c] + + def init_group(self): + self.groups = 1 + + def init_paddings(self): + self.pad = [1, 1] + self.padding_algorithm = "EXPLICIT" + + +create_test_padding_SAME_class(TestAsyPadding) +create_test_padding_SAME_class(TestWithPad_AsyPadding) +create_test_padding_SAME_class(TestWithStride_AsyPadding) +create_test_padding_SAME_class(TestWithGroup_AsyPadding) 
+create_test_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding) + +create_test_padding_VALID_class(TestAsyPadding) +create_test_padding_VALID_class(TestWithPad_AsyPadding) +create_test_padding_VALID_class(TestWithStride_AsyPadding) +create_test_padding_VALID_class(TestWithGroup_AsyPadding) +create_test_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding) + +create_test_cudnn_channel_last_class(TestAsyPadding) +create_test_cudnn_channel_last_class(TestWithPad_AsyPadding) +create_test_cudnn_channel_last_class(TestWithStride_AsyPadding) +create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding) +create_test_cudnn_channel_last_class(TestWithInput1x1Filter1x1_AsyPadding) + +if __name__ == "__main__": + unittest.main() From 027c099c99074b172495f51d21db4504cd810d41 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:55:57 +0800 Subject: [PATCH 071/121] uodata_metax (#106) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * 
[metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_warpctc.cmake * change warpctc.cmake * test * change_run_ut * remove_tets * test * add_generate_pb * [metax]fix paddle bug * change_ut * change_ut * change_ut * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file * [metax] add Rules * [metax] change_patch * update paddle * [metax] fix dot error * [metax]rm opt path and fix activation_kernel bug * updata paddle * chang_meatx_yaml * chang_meatx_yaml * updata_metax * test * test * test * test * test * test * test * test * test * test * test * test --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index bdedcaa7c8e..353cbb098b6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -28,22 +28,38 @@ jobs: --jobs=8 \ --branch ${{ github.base_ref || github.ref_name}} \ --recurse-submodules \ - https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/PaddlePaddle/PaddleCustomDevice.git . + https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . if [ "${{ github.event_name }}" == "pull_request" ]; then git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head + + + paddle_branch=${{ github.base_ref || github.ref_name}} + echo $paddle_branch + # sleep 10000 change_numbers=$(git diff --name-only remotes/origin/${paddle_branch} | wc -l) - change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/"| wc -l) - change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/metax_gpu"| wc -l) + echo $change_numbers + + + change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/" || true) + echo $change_backend + change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/metax_gpu" || true) + echo $change_metax_only + + # change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/"| wc -l) + # echo $change_backend + # change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/metax_gpu"| wc -l) + # echo $change_metax_only + git diff --name-only remotes/origin/${paddle_branch} if [ $change_numbers -ne $change_backend ]; then echo "Common file changed, continue to run metax FULL CI test ..." elif [ $paddle_branch -eq 0 ] ; then - echo "NO metax backend changes found, skip metax FULL CI ...." + echo "NO metax backend changes found, skip metax FULL CI ....." 
exit 0 fi @@ -59,6 +75,7 @@ jobs: bash build.sh - name: run test + run: | cd backends/metax_gpu/tests bash run_test.sh -j 16 From b08a8630a3b1fafbc768b3cb109e8ab9cceaabae Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 17 Oct 2025 10:23:08 +0800 Subject: [PATCH 072/121] updata eigen_and fix_bug (#109) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_warpctc.cmake * change warpctc.cmake * test * change_run_ut * remove_tets * test * add_generate_pb * [metax]fix paddle bug * change_ut * change_ut * change_ut * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file * [metax] add Rules * [metax] change_patch * update paddle * [metax] fix dot error * [metax]rm opt path and fix activation_kernel bug * updata paddle * chang_meatx_yaml * 
chang_meatx_yaml * updata_metax * test * test * test * test * test * test * test * test * test * test * test * test * updata_enigen --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .../patch/mcEigen_3.4.0_paddle_final.zip | Bin 3747604 -> 3747549 bytes backends/metax_gpu/tests/ignore.txt | 7 +++++++ 2 files changed, 7 insertions(+) diff --git a/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip b/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip index d4835abc3517e181bec2093f8cd2977c8b69cd0d..69d962f113256a866c015001b4c2453213e6c22c 100644 GIT binary patch delta 92073 zcmZTRbzD^0_q<_dn3?z91RVl1LkuATDj}#~H`s}aEf%NR)jy_5m zCjYMYMHvad$*oLH=)A&~>YsQTy9&_0Z8EppO)o?Js%-CAk+ZX#r6ML*E@;7qDDAdP zkJ=pVj5-ip?+$dfQ2yxDZ&Q0c$IeGSEdKQyR9#c$ZfwseoR-H60Mj z4-8WwbUU<}n(|GV_u`{gX=nFNM=S+83n<8MORMOqM6@Zy4rjF*tD{Q%rLyZ2d^BLsRb_s*_w`+yEo>*V=Z8iu@G0Yq2Ya^)+QL1f? zJN%)f+Uy_p(i*C^vzu)Cp^aL`5Vl38MyZi`pPn{*id!o`!|8*)om~TGGVqyb*;bh5 z#>sHm;nry1@$z^};y;noNi z(8+^{j^MiJJqMXzJ|5VCxC3{E*3zl$>?#-*Gz&%42&X9bZTe%*i12Ab;iUjT$Tkqu zhmfBbF5AYQie`d$pI@!g%ceFgcIo~M1P>O-xJw+Q_#Fe$yF`iGwqJUyRLU6J@~>ZK z(2P4{RM;jw=%EtjUdQ8W26=(3E`&Q#*}5(7DVR&CUX=cg5>7QbF{BM()H2M=;cO(a zkW*4A8SYUI+l=qJAfM!d)WQ+q@?z5E0dJj`_xVrChGccMg_0aKXZ#r=jh)QDRfi|f zyFvQq+RMoix*&toZM*Pg7>B|ee|v_|j(D!3g0@Km72IeBzEW}-95uhF-~8;`Ahsz- zUNu2B}VAktO3-GJN4SS)PJHDYv+nsApfC!|Boxp*~Yyt5s52Kz~Id#(uzvvlAT zOs=gfw~;A!r3d#`gT@sP;F?nnIyQ*6`~eyqSCKgfc?-!aA7v#up;5e)na*uuIz;@& zX_??^L%C#RM{-7REm7R#5nNT)AC$>mwx^s!roZhq8N#tiL$V%Mun_YK&Yd(3i*k?} zkLT9@$DfgFfey9*Fi~T9uTK29|z;= zx|R%dRbkqb#08uS$*L43NR3u-GZ@qBtGKTWys?&ZcEHM1zh!tRDq1LUYErd;)1lyo z1>9sbYvrk{hwFh^1JtC;247~Dj4gR@=Texb$4Hz?sh5jgzkrdWfppR5ERq%K z>1h01|0i@u54@Q-0|aJa>PW!hfP(TKDI6N^0RFSHCl}_!D1;tYa<&%uc%?#NLira~ zS5JQk72Vev6OZC_svYGd>6D5x=C*HIg&O@UX+*^iX2cV*>{%)N z$4soTlim40ipuIZlRfpKTqK>VVu6B&eEQbvQM19ADP+!uiZtZTq-p&qqx9Ncv4{zq z?WJg-Wuf&~*rVGO`o-_M0HI3tGdZ1!x?z+_suZN?i~g80D0e_#!0Gxc&O#aIb%ZQr_*6|1#WM51@t0Ls z@oW8;eM7QpC~l&d)S8NuC?>3y!Ue6HRP;Eu4n*lU<4@}h60qsP0`#PjVlXQZNEdFG}j>ak`_^JWK>JYljKecE%QrC zs^UJTOplfYn^FKhLm@9SB1mbAC=JPL6=jg-cUAOc!|Pi&#a;&P@1bbKa#$Xnq;4OD z5jAMvM{yoC4C||4qfs&qP%QZmy&+--T)zXw4g&7y1(uO69PDOx<#=9p@gl9WeAmQRXjuJ{5HiK3P~rnD?Tbjd8<#OY;>JC)uh#sNlKJJQS->IX%+-7U(6je6FG0CdvNAyJM+*BuDW29_`3-|2&FHJ-7^j?mN}ho$`YY`@ zO78x-@9`a=+XM2-33Q{5DCw_sATI|fkJ9zNUzLAdwt%nl1hS<8lvzM=kRGQi&*NrA z+B-<;s-T$X8((uXLEa5@leafE;+YO(%z3EN3$yP=`D?FAK+~%ZfF} z`I=FFU8H=ATek%Rvuj?5xEzY2dq9eP@HY?{yIlE?4#$moopL5KrC_5nQB568vK}cT zN$OA-`eECYYyiY>Q%=X+kWTJYLhX_>ba9HRNg>Q|4+L^Z<4nhtOy?>E?@^w`(IkC3 zpd4$&z>PVtT*F#}{#NFoG-=;;BV`hk?D0&QpvBgdH_9d~ z%=zz>uvwAykp_NH{*JM|UpB02n(T10#b&XR0zsvhntoD_U^HjHD1Yav4LX-khHVhy z;qG$DgT1n&%2PU^Q5``OZ~j{}Y6S3!66HLNCh3kUUMdz;*_iyNigsiUfbKByKWdVf z8KsuC8C1iWuy99JWoCCTSCxT9YNWTSwZc>p)P@xN`C}%p=BoUDSqI2=Jzm%JtS|W4 zM~`V(;16?Q<#FKXwq{kZlG;&Go%UK*^st(&46<1yz=0(R^-4pV>XHgabFr^a4;Xb* z$lVI6aB^Tgq(hG|RXTIu)e5T344e_68lzCk+K~gHmQa$FsM3_xrY z{rR(E&qheIdllu(4w0rMs`fLahK*EL6{3ukGWgFciF!Lbj&!-E3YFR?sha7L(dHD@ z7!JXRPO7nL$}_#$v|}G3(<9x9B_hy*!o#OTsif!KRcZ~zu@3m~Ck31r35ZD^)j`;Pj0uEfYL?hw2DVxg7HKh&e0R*#*TT zmtlugw*@3^cTV-0k+!|2nuRVQ&Gvg!f5WNFW7V(w99cUl{IN>S99HFtstjW!g*{iD zRw2_buT)#m5^2jj)f^>NLh<*Dk34wvU^ubdQwI{`Jg|ApXO-CjiB>Aq7nxGS4eBwh 
z^`onL35(Oh0QE$B#5D)2zcSDgrmo9m^^H^~U@O^FS$z&sv!c{bQCYOA_=bb{OK!^s z4gYI|Lt{tS9*v!Rc8kzR=Fc2B*Qqq!J*Zll+^SUbD(Br@G0oHSh$jrrCjQRDPhLprBQp zWNd-$+C*L(h6cPiH0fGCbSK%2_ZxL9NPi!t!6D*Z!`iEcxFKzjK88 zb0O?MqoutrTAM(-m%Mnr>N2q8R49>J>1EYEVeZiQYyZ-s^0&jZ(;3(=M(e|zo*l1k z$V|GKsO{-QxekAtd8VCcXV)xNUaO$*$n2f4CYsh#+f77%WnHu*1q25S(8jQEt;^8D zlqP#?&yB-pJs}q_Y5wytKSUeJyp}jz>*zocyRNyFse({ss(&Jqg56P~v~aA}z}RjW zr(KLDkp>gAg;XpNCu#@l;6z6W%cGd1&8eDj0|!_GP(u0TUJ7v-a;4SK=rxbZvT)J@M2$BB0htCL`Dmp zKyIIc30B!b*NX{H@2oq_QS>);eyh6_&?g(@F$`OxGf^JW$S%6oO4>cXkJrn6Fp#yd zPnAbbY8RcG)U&7Vgc{{W_SYRzA^71pT>w+oGDJ7ah0;I0Sk_wy>>I)MZU?ZJwUM4q z*99PZON+dze}VJnaX8$mRgGrr?lJB;y3Y2ZY+w_o)&+X-(PsIa3Wus(G*YO`!le{6n+r{Z%^f6nL_zbo6Z*?NUF``?@=oh z$|u|AgP8}&mMy5Fmb|v#q}KU-eZ=V+F(Ba)F#H1ROcq6oV<1mA@t=|7 zlg)fXre4fe9?s)r=F-Y-d>sZp-402Ctu=P?;mBE=|H%6wa8BXq0ZrnoC@-={;_o2d zvR!gYQAWOd_|Xhpv6qiw;PriS`cZ`71AKc1PB|!N6>U9O$eTFolvBr#{dEZ>?L(*J ze2DTVw!{2=#M2+)n<8GvW;+WP0^SQ^Y6i}W_!t#Hb{^$_N4%6{dukoB=sA}oL|g; zL^g9S@bKuGx@6y9d=>)}|K_6^IPwzTn}H86^C@V}%iW4CTY&i+J5oazZXss-=tQ#R zD!&&wWnBBIQ0aC40{U->|MaXQK;uc03HYz@0`Ev7Zb0d9zQy05whvxc=gB?5xWck2 z^$uT-#NFoK)84eshXUl@X*f(J=yhR=jVR$eGVuCc-j9LYeHfLL^jfo#$4&uh8%!pk zG?CPQ_&2C++XsA2MzZ`NuVSTi%_Dv@7Fg2$8NUQe?Z;<)DC-Y;!D~%)@|rxN*X=M3 zlvEdaDh(jBg=mvBXl_7X$u)!}K&yTw6iN zl~ToMVHYEM883)zUin^4*rK6CzuBL?v=(yZM80-y};WF^`1+85DR%n%mnl=38s6G_Ap zK_nTuf{uDmo-( zp$p=~jcD1MfYMjtFneeiJh1YN4kw-dWF{4?bkWU#-or2m=rcIa=sOqt`?@! z+M>Fa(mEGV!pOL)Vz?}07ux%5jqn|zmum$q&RZ8;;KL|xjk*?V?s}maWzn{CMUi>yg+$ukWa}<-47A5!yUPZl0d23iwfV%_(B6tLHqK<_2B9Si`?x{4i_q=gC3O$RPAt5vPDpnBwMrtDX~TCh;|#y@qKp)?i6?bqvJKh z0XLj9YZ-1OB|8OY(l-duigyTAkgvE?_=5@!bnUW>q)0+l@^+`t89Uob!U}{wOTq-o z^0-w>qT;C=vO=QWh;JxxJhw}zj;O}n!bpU&cMDmRx2N;*JhEbs5Ko%z5$a;+l0CvC z%CY8@Q&Bch(nh*S5Z0y@qE)2VULgw6^7aZ75enKTOrr$9Ti;(t0-6W?!ol00_X&}R z7O|gc+W5@0K~cariiV$*>;vQ9?HB4GTEYQg6GE>J2$K*Rc~Dr6P`Sb%h94{xHdCpS zJzV=?Qc|e}hlB)b+oi?Vx)a|m5Uk|N5QW&oLT%)cby&!u7>lpXjwa`i0Y;~&XanhT zM5saA7x;NpCy863{Y4a*@cf7nt(PNwv-FQ0G|%nGmPb6Sj4DSPNspt9>f?-YXHeev zXpmQWRH#k4?G`F8BO7Hsh8z=`qY9^w3E2pBJT9z9h(95mLFmQ_p%9@JMZ!Xa;!g^% z5sEw|6jQ_Zt*-TZKk(x=(nSvmR~8qoCyh=EQ>d<^4m7zVKZ8l*NVAz@1!6uc2;^29 z(Scab2uCO<>G2t17e`r5O1^h-EMOOqEec>ok%e}miF`aKOraRklE=Fy0LBh^X~8fW zdx8C(Rzt0Qa9+qlo@vEG1rEU_7X&r3pOJ6B<}?rwBzG4H;Uu*N2YwzO^9l7uAgB z$(Sp`R*LuK@73=L0dLCx;8{ubRpAtJO}Qp?U|R3JE;yio^l6d(w*z4h=0d|QfzPcI zEfANR!U3eaa#MhF5IHxbr?-U~j74~f&<$--C^D+a#r3)TBrKd9yeIJF+FfBib!o%P z>*6;+XMlX?3zoX_1O*tV5}9>hctJUn%zuQt)RH9eAF~nW0|!{{)`9$!DRLhPN@>Ld zA%Mv)dMLC(*-hLR|2`Vn|3PX5i;<+`V_`A!`}|mViO`EDLJ>l{p9$>NtkO0KO`_(&z*e56#%vq4>2xXewXy-B|^p*iidUHWVPDCm1C zKg8{A3E8(}r_y!%FYiqQGD!WD$pzZ13~)cO4nJnaYJ8Eun3 zd=Ov;tvfK3{XLDEW2v_}QVT7`HByZhUV zD-bd&#q|gsP>M4VN>YjE5wfVoS5&Jex5xLDQ`Zp}q|`&{ong$1sW9pXtD!hda1dux z>yD4PeReS5=3$tq>)UC>S;+FKMm$3yX^&P6LVv7}oh8haxs#^8VjKzP#f^x3jTfI# z`QKD;&aVQDM{K1{70`;}KEyjIiZc*us~2ZbEk>WN_oWk1%#jm5jwJSgDiCE5dn4LP zgLsQF)(#oCM!stK%#L)aB~~VdMllyr+MAf7B*am?jctOHcom_?PU2}Qr)=`YwzLQEU1cZVF$^Gp=0Zvbz#KO%9Y$8ia$-7C6qgh4QHu26uD|UC6kGpe z>_R?y|1dVoM?8WQ^?aGfq+s*U^oy{FZ1nI}aW!Rl#!ND+wKlzChP|!GkaSuWX0pfdv;sV952*uk(92n%K zP0XVb_s^K>RUdfPXTvKaSk#m7;V=c=3lf_nS!}SFhtSPn@hn35A>uu%l>6IRb7iF- z%JXa-$r>5$OrD1R&}U$I@vBK5?e1&7M~?$EN2t(k$(DFAh7^y9b|rU4LA|;j#?n}!^O80lKMo5FjdNpCB24zluM_Uq_>8OS5Q?HWHv^!r=lpr6L3fv;N?MH zM~WH9rDsL)J3?ud#4=?SxdwHtEOMMEgX9lx=Sbl9n$%eMgI`7rFs(KYB7PuBoT!)4 zJ|1j8SuUP$VefM@(n1PGMF)@@xq*nTB|J+cE}fW`^83IR~l&Wm42Y45b$ByWgu1IRrWo-#L&r z_3MI!)(yo#R4J<=%p=qV#zx|93Q1=hi30P>t;R4qY4`0WP+%#vDG7!dg~ld}2dOG) zn(PaGL6vp5R!VFJ`97toI0n%)&BO}`-D)Q8r;xO~xd@M*D3_`&#ZTB6*9zts1bem? 
[GIT binary patch payload omitted: the remainder of this hunk is base85-encoded binary delta data (including a reverse "delta 91922" section) that is not representable as readable text; apply the original .patch file to reproduce the binary change.]
zMsjieUX*7&HT%(WruTNiL*b|f@tt^&HJmPWO^mMj8*y5nR^x(>CgJg{v7gN2jXcg{ zOdj(0QJl1lvOBqJo>#_yPs4+tiU)Ii4GdFWGgmoraN*A0aPSixGsHBgKE%Zpgei_> z_)AiRn7vO&ts7v3xC*SJVr0L>3r*^$++#7)C7344jEBB#?};=#lzRDXc_eQx<2*jO z8RJ0yEQ_)HvrBAVETuoBxT6aKlzvks@*?fSQ8u%ed9HiD2=kscFdV?TIu2YOF6OK| PITEh8*o|+BL38>)u$+lc diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index 4e54e17b3ef..be0357e5319 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -23,3 +23,10 @@ test_conv3d_transpose_op test_conv3d_layer test_conv3d_transpose_part2_op test_fused_conv2d_add_act_op +test_swiglu_metax +test_set_value_op +test_pad_op +test_squared_l2_norm_op +test_concat_op +test_dygraph_spectral_norm +test_bincount_op From 53f4bdeb04b6a2d47a2da4d04718302eb3f6a58b Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 17 Oct 2025 13:35:00 +0800 Subject: [PATCH 073/121] updata paddle (#110) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_warpctc.cmake * change warpctc.cmake * test * change_run_ut * remove_tets * test * add_generate_pb * [metax]fix paddle bug * change_ut * change_ut * change_ut * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file * [metax] add Rules * [metax] change_patch * update paddle * [metax] fix dot error * [metax]rm opt path and fix activation_kernel bug * updata paddle * chang_meatx_yaml * chang_meatx_yaml * updata_metax * test * test * test * test * test * test * test * test * test * test * test * test * updata_enigen * updata_paddle --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 89f4bd92f49..fd95abaec01 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 89f4bd92f49e15a9e1803a9e582526b2b8e4557d +Subproject commit fd95abaec0133b2e2f0ab83684925cd62a18150d From bf3074e5fdd7962b08aa6673baf42dcb6ca90025 Mon Sep 17 00:00:00 2001 From: duqimeng <1640472053@qq.com> Date: Fri, 17 Oct 2025 18:16:07 +0800 Subject: [PATCH 074/121] test --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index fd95abaec01..5dbecdcb0e4 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 
fd95abaec0133b2e2f0ab83684925cd62a18150d +Subproject commit 5dbecdcb0e4ddd3488927f49082dfb66c794f9e7 From 8a54b1d850770680759095280a7c500abcc10c05 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 20 Oct 2025 15:07:32 +0800 Subject: [PATCH 075/121] [metax] modify kernels (#117) * modify kernels * modify kernels --- backends/metax_gpu/CMakeLists.txt | 24 +- .../cuda_kernels/argsort_kernel_register.cu | 2 +- .../cuda_kernels/batch_fc_kernel_register.cu | 2 +- .../matmul_grad_kernel_register.cu | 2 +- .../cuda_kernels/matmul_kernel_register.cu | 2 +- .../cuda_kernels/multihead_matmul_kernel.cu | 2 +- .../kernels/dynload/cupti_lib_path.h | 19 - .../kernels/dynload/dynamic_loader.cc | 938 ----- .../kernels/dynload/dynamic_loader.h | 61 - .../kernels/funcs/affine_grid_utils.h | 2 +- backends/metax_gpu/kernels/funcs/blas/blas.cc | 59 - backends/metax_gpu/kernels/funcs/blas/blas.h | 631 ---- .../kernels/funcs/blas/blas_impl.cu.h | 3027 ----------------- .../metax_gpu/kernels/funcs/blas/blas_impl.h | 2003 ----------- .../kernels/funcs/blas/blaslt_gemm_search.h | 794 ----- .../kernels/funcs/blas/blaslt_impl.cu.h | 1137 ------- .../metax_gpu/kernels/funcs/blas/cublas.cc | 40 - .../metax_gpu/kernels/funcs/blas/cublas.h | 148 - .../metax_gpu/kernels/funcs/blas/cublasLt.cc | 27 - .../metax_gpu/kernels/funcs/blas/cublasLt.h | 115 - .../metax_gpu/kernels/funcs/blas/cublaslt.h | 328 -- backends/metax_gpu/kernels/funcs/blas/port.cc | 163 - backends/metax_gpu/kernels/funcs/blas/port.h | 61 - .../metax_gpu/kernels/funcs/layer_norm_util.h | 2 +- .../metax_gpu/kernels/funcs/quant_dequant.h | 430 --- backends/metax_gpu/kernels/gpudnn/cudnn.cc | 78 - backends/metax_gpu/kernels/gpudnn/cudnn.h | 218 -- .../kernels/impl/addmm_kernel_impl.h | 2 +- .../kernels/impl/baddbmm_kernel_impl.h | 2 +- .../kernels/impl/bilinear_grad_kernel_impl.h | 2 +- .../kernels/impl/bilinear_kernel_impl.h | 2 +- .../kernels/impl/bmm_grad_kernel_impl.h | 4 +- .../metax_gpu/kernels/impl/bmm_kernel_impl.h | 2 +- .../kernels/impl/cholesky_grad_kernel_impl.h | 2 +- .../impl/cholesky_solve_grad_kernel_impl.h | 2 +- .../kernels/impl/conv_grad_kernel_impl.h | 2 +- .../metax_gpu/kernels/impl/conv_kernel_impl.h | 2 +- .../kernels/impl/conv_transpose_kernel_impl.h | 2 +- .../impl/deformable_conv_grad_kernel_impl.h | 2 +- backends/metax_gpu/kernels/impl/elementwise.h | 2 +- .../kernels/impl/flatten2_kernel_impl.h | 2 +- .../kernels/impl/gru_unit_kernel_impl.h | 2 +- .../kernels/impl/index_select_impl.h | 2 +- .../kernels/impl/inverse_grad_kernel_impl.h | 2 +- .../metax_gpu/kernels/impl/lstm_kernel_impl.h | 2 +- .../kernels/impl/lu_grad_kernel_impl.h | 2 +- .../kernels/impl/lu_solve_grad_kernel_impl.h | 4 +- .../kernels/impl/matmul_grad_kernel_impl.h | 2042 ----------- .../kernels/impl/matmul_kernel_impl.h | 1717 ---------- .../kernels/impl/matmul_kernel_impl_maca.h | 1696 --------- .../kernels/impl/multi_dot_kernel_impl.h | 2 +- .../metax_gpu/kernels/impl/mv_kernel_impl.h | 2 +- .../kernels/impl/solve_grad_kernel_impl.h | 2 +- .../impl/triangular_solve_grad_kernel_impl.h | 2 +- .../batch_fc_grad_kernel_register.cu | 2 +- .../kernels/metax_kernel/block_attn.h | 2 +- .../kernels/metax_kernel/elementwise.h | 2 +- .../kernels/metax_kernel/metax_context.h | 4 +- .../metax_kernel/mv_grad_kernel_register.cu | 2 +- .../kernels/metax_kernel/quant_dequant.h | 2 +- .../rank_attention_grad_kernel_register.cu | 4 +- .../rank_attention_kernel_register.cu | 4 +- .../slogdeterminant_kernel_register.cu | 2 +- 
.../triangular_solve_kernel_register.cu | 2 +- backends/metax_gpu/patch/paddle.patch | 487 +-- backends/metax_gpu/runtime/runtime.cc | 2 +- 66 files changed, 210 insertions(+), 16127 deletions(-) delete mode 100644 backends/metax_gpu/kernels/dynload/cupti_lib_path.h delete mode 100644 backends/metax_gpu/kernels/dynload/dynamic_loader.cc delete mode 100644 backends/metax_gpu/kernels/dynload/dynamic_loader.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blas.cc delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blas.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blas_impl.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blaslt_gemm_search.h delete mode 100755 backends/metax_gpu/kernels/funcs/blas/blaslt_impl.cu.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/cublas.cc delete mode 100755 backends/metax_gpu/kernels/funcs/blas/cublas.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/cublasLt.cc delete mode 100644 backends/metax_gpu/kernels/funcs/blas/cublasLt.h delete mode 100755 backends/metax_gpu/kernels/funcs/blas/cublaslt.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/port.cc delete mode 100644 backends/metax_gpu/kernels/funcs/blas/port.h delete mode 100644 backends/metax_gpu/kernels/funcs/quant_dequant.h delete mode 100644 backends/metax_gpu/kernels/gpudnn/cudnn.cc delete mode 100644 backends/metax_gpu/kernels/gpudnn/cudnn.h delete mode 100644 backends/metax_gpu/kernels/impl/matmul_grad_kernel_impl.h delete mode 100755 backends/metax_gpu/kernels/impl/matmul_kernel_impl.h delete mode 100644 backends/metax_gpu/kernels/impl/matmul_kernel_impl_maca.h diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 6aecdc1f833..9e257e9507d 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -109,6 +109,10 @@ file( CUDA_SRCS # backends ${PADDLE_SOURCE_DIR}/paddle/phi/backends/gpu/cuda/cuda_info.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/dynamic_loader.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublas.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublasLt.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cudnn.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cuda_driver.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/gpu/cuda/cuda_graph.cc # Core @@ -698,7 +702,6 @@ file( kernels/gpudnn/*.cu kernels/cuda_kernels/*.cc kernels/cuda_kernels/*.cu - kernels/funcs/blas/*.cc kernels/ernie_core/*.cu) set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) @@ -746,11 +749,28 @@ target_compile_definitions( PUBLIC PADDLE_WITH_CUDA=1 PADDLE_WITH_CUSTOM_DEVICE=1 mcblasContext=cublasContext + cublasLtContext=mcblasLtContext GPUContext=CustomContext KPSContext=CustomContext STREAM_TYPE=cudaStream_t EVENT_TYPE=cudaEvent_t - EIGEN_USE_GPU=1) + EIGEN_USE_GPU=1 + CUDA_LIB_NAME="libmcruntime.so" + BLAS_LIB_NAME="libmcblas.so" + BLASLT_LIB_NAME="libmcblasLt.so" + DNN_LIB_NAME="libmcdnn.so" + PTI_LIB_NAME="libmcpti.so" + RAND_LIB_NAME="libcurand.so" + JPEG_LIB_NAME="libnvjpeg.so" + SOLVER_LIB_NAME="libmcsolver.so" + SPARSE_LIB_NAME="libmcsparse.so" + RTC_LIB_NAME="libmcruntime.so" + FLASHATTN_LIB_NAME="libmcFlashAttn.so" + FLASHATTNV3_LIB_NAME="libflashattnv3.so" + CCL_LIB_NAME="libmccl.so" + FFT_LIB_NAME="libcufft.so" + SPARSELT_LIB_NAME="libcusparseLt.so" + CUPTI_LIB_PATH="/root/cu-bridge/CUDA_DIR/extras/CUPTI/lib64") # packing wheel package 
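The *_LIB_NAME definitions added above replace the hardcoded library names from the deleted local dynamic_loader.cc: each macro names the shared object that Paddle's upstream dynamic loader is expected to dlopen at runtime on the Metax backend. A minimal sketch of that pattern is shown below; the wrapper name OpenConfiguredBlasLibrary and the fallback value are illustrative assumptions, not code from this patch.

// Sketch only: resolving the BLAS library configured via the CMake
// compile definition BLAS_LIB_NAME (e.g. "libmcblas.so").
#include <dlfcn.h>

#ifndef BLAS_LIB_NAME
#define BLAS_LIB_NAME "libmcblas.so"  // assumed fallback for this sketch
#endif

// Returns a handle to the BLAS shared library named at build time,
// or nullptr if it is not found on the loader search path.
static void* OpenConfiguredBlasLibrary() {
  return dlopen(BLAS_LIB_NAME, RTLD_LAZY | RTLD_LOCAL);
}

Because the name is injected at build time, pointing the backend at a different BLAS shared object should only require changing the corresponding definition in CMakeLists.txt rather than editing loader code.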
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in diff --git a/backends/metax_gpu/kernels/cuda_kernels/argsort_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/argsort_kernel_register.cu index 8fb331eeedd..20ea33834e6 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/argsort_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/argsort_kernel_register.cu @@ -26,11 +26,11 @@ namespace cub = hipcub; #endif -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/batch_fc_kernel_register.cu index caccb01f71d..0e82304d31d 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/batch_fc_kernel_register.cu @@ -14,10 +14,10 @@ #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { diff --git a/backends/metax_gpu/kernels/cuda_kernels/matmul_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/matmul_grad_kernel_register.cu index f9eef9908ab..bb3b07d24d0 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/matmul_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/matmul_grad_kernel_register.cu @@ -13,9 +13,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "../impl/matmul_grad_kernel_impl.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" #include "paddle/phi/kernels/matmul_grad_kernel.h" PD_CUSTOM_KERNEL_REGISTER(matmul_grad, diff --git a/backends/metax_gpu/kernels/cuda_kernels/matmul_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/matmul_kernel_register.cu index 57c3a85b1ea..750cf2a9f36 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/matmul_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/matmul_kernel_register.cu @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" -#include "kernels/impl/matmul_kernel_impl.h" +#include "paddle/phi/kernels/impl/matmul_kernel_impl.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu index 151c929e41c..998854140fc 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu @@ -15,11 +15,11 @@ #include #include -#include "kernels/funcs/blas/blas.h" #include "paddle/common/errors.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" namespace phi { diff --git a/backends/metax_gpu/kernels/dynload/cupti_lib_path.h b/backends/metax_gpu/kernels/dynload/cupti_lib_path.h deleted file mode 100644 index 6082fffd60e..00000000000 --- a/backends/metax_gpu/kernels/dynload/cupti_lib_path.h +++ /dev/null @@ -1,19 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define CUPTI_LIB_PATH "/root/cu-bridge/CUDA_DIR/extras/CUPTI/lib64" diff --git a/backends/metax_gpu/kernels/dynload/dynamic_loader.cc b/backends/metax_gpu/kernels/dynload/dynamic_loader.cc deleted file mode 100644 index a23b7fa2aff..00000000000 --- a/backends/metax_gpu/kernels/dynload/dynamic_loader.cc +++ /dev/null @@ -1,938 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -// #include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "kernels/dynload/dynamic_loader.h" - -#include - -#include -#include -#include -#include -// #include "paddle/phi/backends/dynload/cupti_lib_path.h" -#include "./dynload/cupti_lib_path.h" -#include "paddle/phi/common/port.h" -#include "paddle/phi/core/enforce.h" - -#if defined(_WIN32) -#include -#endif - -// TODO(wilber): The phi computing library requires a component to manage flags -// (maybe not use gflags). 
-#include "glog/logging.h" -#include "paddle/common/flags.h" - -COMMON_DECLARE_string(cudnn_dir); -COMMON_DECLARE_string(cuda_dir); -COMMON_DECLARE_string(cublas_dir); -COMMON_DECLARE_string(nccl_dir); -COMMON_DECLARE_string(cupti_dir); -COMMON_DECLARE_string(tensorrt_dir); -COMMON_DECLARE_string(mklml_dir); -COMMON_DECLARE_string(lapack_dir); -COMMON_DECLARE_string(mkl_dir); -COMMON_DECLARE_string(op_dir); -COMMON_DECLARE_string(cusparselt_dir); -COMMON_DECLARE_string(curand_dir); -COMMON_DECLARE_string(cusolver_dir); -COMMON_DECLARE_string(cusparse_dir); -COMMON_DECLARE_string(win_cuda_bin_dir); -#ifdef PADDLE_WITH_HIP - -PHI_DEFINE_string(miopen_dir, - "", - "Specify path for loading libMIOpen.so. For instance, " - "/opt/rocm/miopen/lib. If empty [default], dlopen " - "will search miopen from LD_LIBRARY_PATH"); - -PHI_DEFINE_string(rocm_dir, - "", - "Specify path for loading rocm library, such as librocblas, " - "libmiopen, libhipsparse. For instance, /opt/rocm/lib. " - "If default, dlopen will search rocm from LD_LIBRARY_PATH"); - -PHI_DEFINE_string(rccl_dir, - "", - "Specify path for loading rccl library, such as librccl.so. " - "For instance, /opt/rocm/rccl/lib. If default, " - "dlopen will search rccl from LD_LIBRARY_PATH"); -#endif - -// #ifdef PADDLE_WITH_FLAGCX -// COMMON_DECLARE_string(flagcx_dir); -// #endif - -// PHI_DEFINE_EXPORTED_string( -// flagcx_dir, // NOLINT -// "", -// "Specify path for loading libflagcx.so. For instance, " -// "For instance, /usr/local/flagcx/lib. If default, " -// "dlopen will search flagcx from LD_LIBRARY_PATH"); - -#ifdef PADDLE_WITH_XPU -PD_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); -#endif - -namespace phi::dynload { - -struct PathNode { - PathNode() = default; - std::string path = ""; -}; - -static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; // NOLINT - -// NOTE: In order to adapt to the default installation path of cuda -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin"; -#else -static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64"; // NOLINT -#endif - -static PathNode s_py_site_pkg_path; - -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll"; -static constexpr char* win_cublas_lib = - "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll"; -#if CUDA_VERSION >= 11000 -static constexpr char* win_curand_lib = - "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll"; -static constexpr char* win_nvjpeg_lib = - "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll"; -static constexpr char* win_cusolver_lib = - "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusolver64_" CUDA_VERSION_MAJOR - ".dll;cusolver64_11.dll;cusolver64_10.dll"; -static constexpr char* win_cusparse_lib = - "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll"; -static constexpr char* win_cufft_lib = - "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_11.dll;cufft64_10.dll"; -#else -static constexpr char* win_curand_lib = - "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;curand64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_nvjpeg_lib = - "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - 
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_cusolver_lib = - "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_cusparse_lib = - "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_cufft_lib = - "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll"; -#endif // CUDA_VERSION -#endif - -static inline std::string join(const std::string& part1, - const std::string& part2) { -// directory separator -#if defined(_WIN32) - const char sep = '\\'; -#else - const char sep = '/'; -#endif - if (!part2.empty() && part2.front() == sep) { - return part2; - } - std::string ret; - ret.reserve(part1.size() + part2.size() + 1); - ret = part1; - if (!ret.empty() && ret.back() != sep) { - ret += sep; - } - ret += part2; - return ret; -} - -static inline std::vector split( - const std::string& str, const std::string separator = " ") { - std::vector str_list; - std::string::size_type firstPos = 0; - firstPos = str.find_first_not_of(separator, 0); - std::string::size_type lastPos = 0; - lastPos = str.find_first_of(separator, firstPos); - while (std::string::npos != firstPos && std::string::npos != lastPos) { - str_list.push_back(str.substr(firstPos, lastPos - firstPos)); - firstPos = str.find_first_not_of(separator, lastPos); - lastPos = str.find_first_of(separator, firstPos); - } - if (std::string::npos == lastPos) { - str_list.push_back(str.substr(firstPos, lastPos - firstPos)); - } - return str_list; -} - -void SetPaddleLibPath(const std::string& py_site_pkg_path) { - s_py_site_pkg_path.path = py_site_pkg_path; - VLOG(3) << "Set paddle lib path : " << py_site_pkg_path; -} - -static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path, - const std::string& dso_name, - int dynload_flags) { - void* dso_handle = nullptr; - if (!spec_path.empty()) { - // search xxx.so from custom path - VLOG(3) << "Try to find library: " << dso_name - << " from specific path: " << spec_path; - std::string dso_path = join(spec_path, dso_name); - dso_handle = dlopen(dso_path.c_str(), dynload_flags); - } - return dso_handle; -} - -static inline std::string FindLibAbsolutePath(const std::string& directory, - const std::string& filename) { - DIR* dir = opendir(directory.c_str()); - struct dirent* ent; - - if (dir != nullptr) { - while ((ent = readdir(dir)) != nullptr) { - if (ent->d_type == DT_REG || ent->d_type == DT_LNK) { - if (filename == std::string(ent->d_name)) { - closedir(dir); - return join(directory, ent->d_name); - } - } else if (ent->d_type == DT_DIR) { - if (strcmp(ent->d_name, ".") != 0 && strcmp(ent->d_name, "..") != 0) { - std::string res = - FindLibAbsolutePath(join(directory, ent->d_name) + "/", filename); - if (!res.empty()) { - closedir(dir); - return res; - } - } - } - } - closedir(dir); - } - return ""; -} - -static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, - int dynload_flags) { - // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH - // and /usr/local/lib path - void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); - VLOG(3) << "Try to find library: " << dso_path - << " from default system path."; - -// TODO(chenweihang): This path is used to search which libs? -// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to -// bring System Integrity Projection (SIP), if dso_handle -// is null, search from default package path in Mac OS. 
-#if defined(__APPLE__) || defined(__OSX__) -#if defined(__arm__) || defined(__aarch64__) - if (nullptr == dso_handle) { - dso_handle = - dlopen(FindLibAbsolutePath("/opt/homebrew/Cellar/", dso_path).c_str(), - dynload_flags); - } -#else - if (nullptr == dso_handle) { - dso_handle = - dlopen(FindLibAbsolutePath("/usr/local/cuda/lib/", dso_path).c_str(), - dynload_flags); - } -#endif -#endif - - return dso_handle; -} - -/* - * We define three priorities for dynamic library search: - * - * First: Search for path specified by the user - * Second: Search the stheystem default path - * Third: Search for a special path corresponding to - * a specific library to adapt to changes and easy to expand. - */ - -static inline void* GetDsoHandleFromSearchPath( - const std::string& config_path, - const std::string& dso_name, - bool throw_on_error = true, - const std::vector& extra_paths = std::vector(), - const std::string& warning_msg = std::string()) { -#if !defined(_WIN32) - int dynload_flags = RTLD_LAZY | RTLD_LOCAL; -#else - int dynload_flags = 0; -#endif // !_WIN32 -#if defined(_WIN32) - std::vector cuda_bin_search_path = { - L"cublas", - L"cuda_nvrtc", - L"cuda_runtime", - L"cudnn", - L"cufft", - L"curand", - L"cusolver", - L"cusparse", - L"nvjitlink", - }; - for (auto search_path : cuda_bin_search_path) { - std::wstring_convert> converter; - std::wstring win_path_wstring = - converter.from_bytes(FLAGS_win_cuda_bin_dir); - search_path = win_path_wstring + L"\\" + search_path + L"\\bin"; - AddDllDirectory(search_path.c_str()); - } -#endif - std::vector dso_names = split(dso_name, ";"); - void* dso_handle = nullptr; - for (auto const& dso : dso_names) { - // 1. search in user config path by FLAGS - dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags); - // 2. search in system default path - if (nullptr == dso_handle) { - dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags); - } - // 3. search in extra paths - if (nullptr == dso_handle) { - for (auto const& path : extra_paths) { - VLOG(3) << "extra_paths: " << path; - dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags); - } - } - if (nullptr != dso_handle) break; - } - - // 4. [If Failed for All dso_names] logging warning if exists - if (nullptr == dso_handle && !warning_msg.empty()) { - LOG(WARNING) << warning_msg; - } - - // 5. [If Failed for All dso_names] logging or throw error info - if (nullptr == dso_handle) { - auto error_msg = - "The third-party dynamic library (%s) that Paddle depends on is not " - "configured correctly. (error code is %s)\n" - " Suggestions:\n" - " 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) " - "is installed correctly and its version is matched with paddlepaddle " - "you installed.\n" - " 2. 
Configure third-party dynamic library environment variables as " - "follows:\n" - " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" - " - Windows: set PATH by `set PATH=XXX;%%PATH%%`\n" - " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` " - "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " - "impossible unless System Integrity Protection (SIP) is disabled.]"; -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - if (throw_on_error) { - // NOTE: Special error report case, no need to change its format - PADDLE_THROW( - common::errors::PreconditionNotMet(error_msg, dso_name, errorno)); - } else { - LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno); - } - } - - return dso_handle; -} - -void* GetCublasDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_11.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } - -#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.11"); -#else - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=11000-12000 start" ; - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblas.so"); - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=11000-12000 end" ; -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12"); -#else - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=12000-13000 start" ; - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblas.so"); - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=12000-13000 end" ; -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); -#else - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=else start" ; - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblas.so"); - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=else end" ; -// return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblas.so"); -#endif -} - -void* GetCublasLtDsoHandle() { -// APIs available after CUDA 10.1 -#if defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.11"); -#else - // return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); - return 
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblasLt.so"); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12"); -#else - // return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblasLt.so"); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_11.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_12.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 12, paddle " - "temporarily no longer supports"); - return nullptr; - } -#elif !defined(__linux__) && defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10010 - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so"); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhipblaslt.so"); -#else - std::string warning_msg( - "Your CUDA_VERSION less 10.1, not support CublasLt. " - "If you want to use CublasLt, please upgrade CUDA and rebuild " - "PaddlePaddle."); - return nullptr; -#endif -} - -void* GetCUDNNDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - std::string mac_warn_meg( - "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " - "For instance, sudo tar -xzf " - "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " - "chmod a+r /usr/local/cuda/include/cudnn.h " - "/usr/local/cuda/lib/libcudnn*"); - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libcudnn.dylib", false, {}, mac_warn_meg); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - std::string win_warn_meg( - "Note: [Recommend] copy cudnn into CUDA installation directory. 
\n " - "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from " - "NVIDIA's official website, \n" - "then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing " - "Toolkit\\CUDA\\v10.0\n" - "You should do this according to your CUDA installation directory and " - "CUDNN version."); - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12030) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); -#endif - } else if (CUDA_VERSION >= 12030) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); -#endif - } -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); -#else -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - if (CUDA_VERSION >= 12030) { - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libcudnn.so.9", false, {cuda_lib_path}); - } else { - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libcudnn.so.8", false, {cuda_lib_path}); - } -#else - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libmcdnn.so", false, {cuda_lib_path}); -#endif -#endif -} - -void* GetCUPTIDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path}); -#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libcupti.so.11.8", false, {cupti_lib_path}); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libmcpti.so", false, {cupti_lib_path}); -#endif - - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path}); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libmcpti.so", false, {cupti_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#else - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libmcpti.so", false, {cupti_lib_path}); -#endif -} - -void* GetCurandDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, "curand64_10.dll", true, {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); -#endif -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); -#else -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so.10"); -#else - return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so"); -#endif - -#endif -} - -#ifdef PADDLE_WITH_HIP -void* GetROCFFTDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib"); -#else - 
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhipfft.so"); -#endif -} -#endif - -void* GetNvjpegDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_nvjpeg_lib, true, {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so"); -#endif -} - -void* GetCusolverDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, "cusolver64_11.dll", true, {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path}); -#endif -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsolver.so"); -#else -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcsolver.so"); -#endif -#endif -} - -void* GetCusparseDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_11.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.11"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libmcsparse.so"); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libmcsparse.so"); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 12, paddle " - "temporarily no longer."); - return nullptr; - } -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcsparse.so"); -#endif -} - -void* GetNVRTCDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcruntime.so", false); -#endif -} - -void* GetCUDADsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return 
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#elif defined(_WIN32) - char system32_dir[MAX_PATH]; - GetSystemDirectory(system32_dir, MAX_PATH); - return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcruntime.so", false); -#endif -} - -void* GetWarpCTCDsoHandle() { - std::string warpctc_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - warpctc_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll"); -#else - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so"); -#endif -} - -void* GetWarpRNNTDsoHandle() { - std::string warprnnt_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - warprnnt_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(warprnnt_dir, "libwarprnnt.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(warprnnt_dir, "warprnnt.dll"); -#else - return GetDsoHandleFromSearchPath(warprnnt_dir, "libwarprnnt.so"); -#endif -} - -void* GetFlashAttnDsoHandle() { - std::string flashattn_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - flashattn_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattn.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(flashattn_dir, "flashattn.dll"); -#else - return GetDsoHandleFromSearchPath(flashattn_dir, "libmcFlashAttn.so"); -#endif -} - -void* GetFlashAttnV3DsoHandle() { - std::string flashattn_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - flashattn_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattnv3.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(flashattn_dir, "flashattnv3.dll"); -#else - return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattnv3.so"); -#endif -} - -void* GetAfsApiDsoHandle() { - std::string afsapi_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - afsapi_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) || defined(_WIN32) - return NULL; -#else - return GetDsoHandleFromSearchPath(afsapi_dir, "libafs-api-so.so"); -#endif -} - -void* GetNCCLDsoHandle() { -#ifdef PADDLE_WITH_HIP - std::string warning_msg( - "You may need to install 'rccl' from ROCM official website: " - "https://rocmdocs.amd.com/en/latest/Installation_Guide/" - "Installation-Guide.html before install PaddlePaddle."); -#else - std::string warning_msg( - "You may need to install 'nccl2' from NVIDIA official website: " - "https://developer.nvidia.com/nccl/nccl-download " - "before install PaddlePaddle."); -#endif - -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath( - FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg); -#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) - return GetDsoHandleFromSearchPath( - FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); -#else -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_nccl_dir, "libnccl.so;libnccl.so.2", true, {}, warning_msg); -#else - return GetDsoHandleFromSearchPath( - FLAGS_nccl_dir, "libmccl.so", true, 
{}, warning_msg); -#endif - -#endif -} - -// void* GetFLAGCXDsoHandle() { -// #ifdef PADDLE_WITH_FLAGCX -// return GetDsoHandleFromSearchPath(FLAGS_flagcx_dir, "libflagcx.so"); -// #else -// return nullptr; -// #endif -// } - -void* GetTensorRtDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); -#endif -} - -void* GetMKLMLDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); -#endif -} - -void* GetLAPACKDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) -#if defined(__arm__) || defined(__aarch64__) - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dylib"); -#else - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib"); -#endif -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3"); -#endif -} - -void* GetOpDsoHandle(const std::string& dso_name) { - return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name); -} - -void* GetNvtxDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - PADDLE_THROW(common::errors::Unimplemented("Nvtx do not support Apple.")); -#elif defined(_WIN32) - PADDLE_THROW(common::errors::Unimplemented("Nvtx do not support Windows.")); -#elif !defined(PADDLE_WITH_CUDA) - PADDLE_THROW( - common::errors::Unimplemented("Nvtx do not support without CUDA.")); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so"); -#endif -} - -void* GetCUFFTDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib"); -#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.10"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.11"); - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer."); - return nullptr; - } -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_10.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); -#endif -} - 
-void* GetMKLRTDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so"); -#endif -} - -void* GetCusparseLtDsoHandle() { -// APIs available after CUDA 11.2 -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11020 && 0 - return GetDsoHandleFromSearchPath(FLAGS_cusparselt_dir, "libcusparseLt.so"); -#else - std::string warning_msg( - "Your CUDA_VERSION less 11.2, not support cusparseLt. " - "If you want to use cusparseLt, please upgrade CUDA and rebuild " - "PaddlePaddle."); - return nullptr; -#endif -} - -void* GetXPTIDsoHandle() { -#ifdef PADDLE_WITH_XPTI - return GetDsoHandleFromSearchPath(FLAGS_xpti_dir, "libxpti.so"); -#else - return nullptr; -#endif -} -} // namespace phi::dynload diff --git a/backends/metax_gpu/kernels/dynload/dynamic_loader.h b/backends/metax_gpu/kernels/dynload/dynamic_loader.h deleted file mode 100644 index a5d3d0ff76c..00000000000 --- a/backends/metax_gpu/kernels/dynload/dynamic_loader.h +++ /dev/null @@ -1,61 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/utils/test_macros.h" -namespace phi { -namespace dynload { - -#ifndef _WIN32 -#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) -#else -#define DECLARE_TYPE(__name, ...) 
decltype(auto) -#endif - -void* GetCublasDsoHandle(); -void* GetCublasLtDsoHandle(); -TEST_API void* GetCUDNNDsoHandle(); -void* GetCUPTIDsoHandle(); -void* GetCurandDsoHandle(); -void* GetNvjpegDsoHandle(); -void* GetCusolverDsoHandle(); -void* GetCusparseDsoHandle(); -void* GetNVRTCDsoHandle(); -void* GetCUDADsoHandle(); -void* GetWarpCTCDsoHandle(); -void* GetWarpRNNTDsoHandle(); -void* GetFlashAttnDsoHandle(); -void* GetFlashAttnV3DsoHandle(); -void* GetNCCLDsoHandle(); -// void* GetFLAGCXDsoHandle(); -void* GetTensorRtDsoHandle(); -void* GetMKLMLDsoHandle(); -void* GetLAPACKDsoHandle(); -void* GetOpDsoHandle(const std::string& dso_name); -void* GetNvtxDsoHandle(); -void* GetCUFFTDsoHandle(); -void* GetMKLRTDsoHandle(); -void* GetROCFFTDsoHandle(); -void* GetCusparseLtDsoHandle(); -void* GetXPTIDsoHandle(); -void* GetAfsApiDsoHandle(); - -void SetPaddleLibPath(const std::string&); - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/affine_grid_utils.h b/backends/metax_gpu/kernels/funcs/affine_grid_utils.h index c137d9ad468..b973d75a9be 100644 --- a/backends/metax_gpu/kernels/funcs/affine_grid_utils.h +++ b/backends/metax_gpu/kernels/funcs/affine_grid_utils.h @@ -14,8 +14,8 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/funcs/blas/blas.cc b/backends/metax_gpu/kernels/funcs/blas/blas.cc deleted file mode 100644 index 098a0400552..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blas.cc +++ /dev/null @@ -1,59 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// clang-format off -#include "funcs/blas/blas.h" // NOLINT -#include "paddle/phi/core/enforce.h" -// clang-format on -namespace phi { -namespace funcs { -MatDescriptor CreateMatrixDescriptor(const DDim &tensor_dim, - int num_flatten_cols, - bool trans) { - PADDLE_ENFORCE_GT( - tensor_dim.size(), - 1, - phi::errors::InvalidArgument("The tensor dim size should be greater " - "than 1, but reveived dim size is %d", - tensor_dim.size())); - MatDescriptor retv; - if (num_flatten_cols > 1) { - auto flatten_dim = common::flatten_to_2d(tensor_dim, num_flatten_cols); - retv.height_ = flatten_dim[0]; - retv.width_ = flatten_dim[1]; - } else { - if (tensor_dim.size() == 2) { - retv.height_ = tensor_dim[0]; - retv.width_ = tensor_dim[1]; - } else { - auto dim_vec = common::vectorize(tensor_dim); - retv.batch_size_ = 1; - for (size_t i = 0; i < dim_vec.size() - 2; ++i) { - retv.batch_size_ *= dim_vec[i]; - } - retv.height_ = dim_vec[dim_vec.size() - 2]; - retv.width_ = dim_vec[dim_vec.size() - 1]; - retv.stride_ = retv.height_ * retv.width_; - } - } - if (trans) { - std::swap(retv.width_, retv.height_); - } - retv.trans_ = trans; - return retv; -} -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/blas.h b/backends/metax_gpu/kernels/funcs/blas/blas.h deleted file mode 100644 index 75ea8c921e2..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blas.h +++ /dev/null @@ -1,631 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -#ifdef PADDLE_WITH_MKLML -#include "paddle/phi/backends/dynload/mklml.h" -#endif - -#ifdef PADDLE_WITH_LIBXSMM -#include -#endif - -#if defined(PADDLE_USE_OPENBLAS) || defined(PADDLE_USE_REFERENCE_CBLAS) -#include -#endif -// #include "paddle/phi/core/enforce_metax.h" -namespace phi { -namespace funcs { - -/** - * Matrix Descriptor of a memory buffer. - * - * It is used for Blas::MatMul. MatMul operator can be batched. - * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a - * `batch_size` times of GEMM. The batched GEMM could be faster base on the - * implementation of the blas library. The batch size could be zero. If any - * matrix of `matmul` has a batch size, there will be a batched GEMM, too. e.g., - * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be - * [BatchSize, H1, W2] - * - * The boolean flag, `trans`, describe the memory is the transpose of matrix or - * not. If the trans is true, the last two dims of matrix are transposed. The - * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height]. - * - * The MatDescriptor is not only the dimension or shape of a matrix, it also - * contains the layout, stride of matrix. It is clearer to have a structure than - * reuse `DDim`. 
- */ -struct MatDescriptor { - int64_t height_; - int64_t width_; - int64_t stride_{0}; - int64_t batch_size_{0}; - bool trans_; -}; - -/** - * Create Matrix Descriptor from a tensor dim, num_flatten_cols, and transpose - * flag - * - * @param tensor_dim: The dimension of the tensor. The rank of this dimension - * must larger than 1. - * - * @param num_flatten_cols: Reshape a tensor to a matrix. The matrix's first - * dimension(column length) will be the product of tensor's first `num_col_dims` - * dimensions. If num_flatten_cols is zero, the first N-2 dimension will be the - * batch_size of descriptor. - * - * @param trans: True if the matrix is transposed. - */ -extern MatDescriptor CreateMatrixDescriptor(const DDim& tensor_dim, - int num_flatten_cols, - bool trans); - -template -class Blas { - public: - explicit Blas(const DeviceContext& context) : dev_ctx_(context) {} - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T* A, - const T* B, - T beta, - T* C) const; - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T* A, - const T* B, - U beta, - T* C) const; - - template - void GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) const; - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) const; - -#ifdef PADDLE_WITH_MKLML // @{ Group MKLML: class Blas - template - T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, - const int M, - const int N, - const int K) const; - - template - void GEMM_PACK(const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T* src, - const int ld, - T* dst) const; - - template - void GEMM_COMPUTE(int transA, - int transB, - int M, - int N, - int K, - const T* A, - const int lda, - const T* B, - const int ldb, - T beta, - T* C, - const int ldc) const; - - template - void GEMM_FREE(T* data) const; - - template - void CSRMM(const char* transa, - const int* m, - const int* n, - const int* k, - const T* alpha, - const char* matdescra, - const T* val, - const int* indx, - const int* pntrb, - const int* pntre, - const T* b, - const int* ldb, - const T* beta, - T* c, - const int* ldc) const; - -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - template - void MatMulWithHead(const phi::DenseTensor& mat_a, - const MatDescriptor& dim_a, - const phi::DenseTensor& mat_b, - const MatDescriptor& dim_b, - T alpha, - int head_number, - phi::DenseTensor* mat_out, - T beta, - bool mat_y_split_vertical) const; -#endif -#endif // @} End Group MKLML: class Blas - - template - void MatMul(const int M, - const int N, - const int K, - const T* A, - const T* B, - T* C) const; - - template - void MatMul(const phi::DenseTensor& mat_a, - bool trans_a, - const phi::DenseTensor& mat_b, - bool trans_b, - T alpha, - phi::DenseTensor* mat_out, - T beta) const; - - template - void MatMul(const phi::DenseTensor& mat_a, - bool trans_a, - const phi::DenseTensor& mat_b, - bool trans_b, - phi::DenseTensor* mat_out) const { - MatMul(mat_a, - trans_a, - mat_b, - trans_b, - static_cast(1.0), - mat_out, - static_cast(0.0)); - } - - template - void MatMul(const phi::DenseTensor& mat_a, - const phi::DenseTensor& mat_b, - phi::DenseTensor* mat_out) const { - 
this->template MatMul(mat_a, false, mat_b, false, mat_out); - } - - template - void AXPY(int n, T alpha, const T* x, T* y) const; - - template - void VADD(int n, const T* x, const T* y, T* z) const; - - template - void VSUB(int n, const T* x, const T* y, T* z) const; - - template - void VMUL(int n, const T* x, const T* y, T* z) const; - - template - void VDIV(int n, const T* x, const T* y, T* z) const; - - template - void VCOPY(int n, const T* x, T* y) const; - - template - void VEXP(int n, const T* x, T* y) const; - - template - void VSQUARE(int n, const T* x, T* y) const; - - template - void VPOW(int n, const T* x, T alpha, T* y) const; - - template - void GEMV(bool trans_a, - int M, - int N, - T alpha, - const T* A, - const T* B, - T beta, - T* C) const; - - template - T DOT(int n, const T* x, const T* y) const; - - template - void CUDOT( - int n, const T* x, int incx, const T* y, int incy, T* result) const; - template - void SCAL(int n, const T a, T* x) const; - - template - T ASUM(int n, T* x, int inc) const; - - template - void BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T* A, - const T* B, - T beta, - T* C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const; - - template - void BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T* A, - const T* B, - U beta, - T* C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const; - - template - void BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T** A, - const T** B, - T beta, - T** C, - int batchCount) const; - -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - template - void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int W1, - int H1, - int W2, - int H2, - T alpha, - const T* A, - const T* B, - T beta, - T* C, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t head_number, - bool split_b_vertical) const; -#endif - - template - void MatMul(const phi::DenseTensor& mat_a, - const MatDescriptor& dim_a, - const phi::DenseTensor& mat_b, - const MatDescriptor& dim_b, - T alpha, - phi::DenseTensor* mat_out, - T beta) const; - - template - void MatMul(const T* mat_a, - const MatDescriptor& dim_a, - const T* mat_b, - const MatDescriptor& dim_b, - T alpha, - T* mat_out, - T beta) const; - - template - void VINV(int n, const T* a, T* y) const; - - template - void VMERF(int n, const T* a, T* y, int64_t mode) const; - - template - void TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T* A, - int lda, - T* B, - int ldb) const; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - template - void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const; - - template - void BatchedGETRI(int n, - const T** a, - const int* ipiv, - T** a_inv, - int* info, - int batch_size) const; - - template - void BatchedMatInv( - int n, const T** a, T** a_inv, int* info, int batch_size) const; - - // cuBlas solve - template - void BatchedGETRS(CBLAS_TRANSPOSE trans, - int n, - int nrhs, - const T** a, - int lda, - int* ipiv, - T** b, - int ldb, - int* info, - int batch_size) const; - - // cuBlas triangular_solve - template - void BatchedTRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T** 
a, - int lda, - T** b, - int ldb, - int batch_size) const; -#endif - - private: - const DeviceContext& dev_ctx_; -}; - -template -class BlasT : private Blas { - public: - using Blas::Blas; - - template - void GEMM(ARGS... args) const { - Base()->template GEMM(args...); - } - -#ifdef PADDLE_WITH_MKLML // @{ Group MKLML: class BlasT - template - T* GEMM_ALLOC(ARGS... args) const { - return Base()->template GEMM_ALLOC(args...); - } - - template - void GEMM_PACK(ARGS... args) const { - Base()->template GEMM_PACK(args...); - } - - template - void GEMM_COMPUTE(ARGS... args) const { - Base()->template GEMM_COMPUTE(args...); - } - - template - void GEMM_FREE(ARGS... args) const { - Base()->template GEMM_FREE(args...); - } - - template - void CSRMM(ARGS... args) const { - Base()->template CSRMM(args...); - } - -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - template - void MatMulWithHead(ARGS... args) const { - Base()->template MatMulWithHead(args...); - } -#endif -#endif // @} End Group MKLML: class BlasT - - template - void MatMul(ARGS... args) const { - Base()->template MatMul(args...); - } - - template - void AXPY(ARGS... args) const { - Base()->template AXPY(args...); - } - - template - void VADD(ARGS... args) const { - Base()->template VADD(args...); - } - - template - void VSUB(ARGS... args) const { - Base()->template VSUB(args...); - } - - template - void VMUL(ARGS... args) const { - Base()->template VMUL(args...); - } - - template - void VDIV(ARGS... args) const { - Base()->template VDIV(args...); - } - - template - void VCOPY(ARGS... args) const { - Base()->template VCOPY(args...); - } - - template - void VEXP(ARGS... args) const { - Base()->template VEXP(args...); - } - - template - void VSQUARE(ARGS... args) const { - Base()->template VSQUARE(args...); - } - - template - void VPOW(ARGS... args) const { - Base()->template VPOW(args...); - } - - template - void GEMV(ARGS... args) const { - Base()->template GEMV(args...); - } - - template - T DOT(ARGS... args) const { - return Base()->template DOT(args...); - } - template - void CUDOT(ARGS... args) const { - Base()->template CUDOT(args...); - } - template - void SCAL(ARGS... args) const { - Base()->template SCAL(args...); - } - - template - T ASUM(ARGS... args) const { - return Base()->template ASUM(args...); - } - - template - void BatchedGEMM(ARGS... args) const { - Base()->template BatchedGEMM(args...); - } - - template - void VINV(ARGS... args) const { - Base()->template VINV(args...); - } - - template - void VMERF(ARGS... args) const { - Base()->template VMERF(args...); - } - - template - void TRSM(ARGS... args) const { - Base()->template TRSM(args...); - } - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - template - void BatchedGETRF(ARGS... args) const { - Base()->template BatchedGETRF(args...); - } - - template - void BatchedGETRI(ARGS... args) const { - Base()->template BatchedGETRI(args...); - } - - template - void BatchedMatInv(ARGS... args) const { - Base()->template BatchedMatInv(args...); - } - - // solve - template - void BatchedGETRS(ARGS... args) const { - Base()->template BatchedGETRS(args...); - } - - // triangular_solve - template - void BatchedTRSM(ARGS... 
args) const { - Base()->template BatchedTRSM(args...); - } -#endif - - private: - const Blas* Base() const { - return static_cast*>(this); - } -}; - -template -inline BlasT GetBlas(const DeviceContext& dev_ctx) { - return BlasT(dev_ctx); -} - -} // namespace funcs -} // namespace phi -// clang-format off -#include "./blas_impl.h" -#ifdef PADDLE_WITH_CUDA -#include "./blas_impl.cu.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h" -#endif -// clang-format on diff --git a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h deleted file mode 100644 index ae4baa52613..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h +++ /dev/null @@ -1,3027 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#if defined(__NVCC__) -#include -#endif -#include "./cublas.h" -#include "glog/logging.h" -#include "paddle/common/flags.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -// #include "paddle/phi/core/flags.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define INT_MAX_VALUE 2147483647 - -PHI_DECLARE_bool(enable_cublas_tensor_op_math); -PHI_DECLARE_bool(gemm_use_half_precision_compute_type); - -namespace phi { -namespace funcs { -template -struct CUBlas; - -template <> -struct CUBlas { - template - static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemm(args...)); - } - - template - static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSaxpy(args...)); - } - - template - static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSscal(args...)); - } - - template - static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasScopy(args...)); - } - - template - static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemv(args...)); - } - - template - static void GEMM_BATCH(ARGS... args) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmBatched(args...)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "SgemmBatched is not supported on cuda <= 7.5")); -#endif - } - - template - static void GEMM_STRIDED_BATCH(ARGS... args) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasSgemmStridedBatched(args...)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "SgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
- // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const float *beta, - void *C, - cudaDataType_t Ctype, - int ldc) { -// Because the gcc 4.8 doesn't expand template parameter pack that -// appears in a lambda-expression, I can not use template parameter pack -// here. -#if CUDA_VERSION >= 8000 - VLOG(5) << "use_tensor_op_math: " - << (dev_ctx->tensor_core_available() ? "True" : "False"); - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasSgemmEx is not supported on cuda <= 7.5")); -#endif - } - - template - static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasStrsm(args...)); - } - - template - static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgetrfBatched(args...)); - } - - template - static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgetriBatched(args...)); - } - - template - static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSmatinvBatched(args...)); - } - - template - static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgetrsBatched(args...)); - } - - template - static void TRSM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasStrsmBatched(args...)); - } -}; - -template <> -struct CUBlas { - template - static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemm(args...)); - } - - template - static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDaxpy(args...)); - } - - template - static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDscal(args...)); - } - - template - static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDcopy(args...)); - } - - template - static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemv(args...)); - } - - template - static void GEMM_BATCH(ARGS... args) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemmBatched(args...)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "DgemmBatched is not supported on cuda <= 7.5")); -#endif - } - - template - static void GEMM_STRIDED_BATCH(ARGS... args) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasDgemmStridedBatched(args...)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "DgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - template - static void GEMM_EX(ARGS... args UNUSED) { - PADDLE_THROW( - phi::errors::Unimplemented("Currently there are not cublasDgemmEx.")); - } - - template - static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDtrsm(args...)); - } - - template - static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgetrfBatched(args...)); - } - - template - static void GETRI_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgetriBatched(args...)); - } - - template - static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDmatinvBatched(args...)); - } - - template - static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgetrsBatched(args...)); - } - - template - static void TRSM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDtrsmBatched(args...)); - } -}; - -template <> -struct CUBlas { - using float16 = phi::dtype::float16; - - static void GEMM(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float16 *alpha, - const float16 *A, - int lda, - const float16 *B, - int ldb, - const float16 *beta, - float16 *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasHgemm(handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - -#if defined(__NVCC__) - static void GEMM_BATCH(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const float16 **A, - cudaDataType_t Atype, - int lda, - const float16 **B, - cudaDataType_t Btype, - int ldb, - const float *beta, - float16 **C, - cudaDataType_t Ctype, - int ldc, - int batchCount, - cublasComputeType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A_ptr.data().get(), - Atype, - lda, - B_ptr.data().get(), - Btype, - ldb, - beta, - C_ptr.data().get(), - Ctype, - ldc, - batchCount, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmBatchedEx is not supported on cuda <= 7.5")); -#endif - } -#endif - - static void GEMM_STRIDED_BATCH(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float16 *alpha, - const float16 *A, - int lda, - long long int strideA, // NOLINT - const float16 *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const float16 *beta, - float16 *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasHgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "HgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
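// -----------------------------------------------------------------------------
// Illustrative sketch only (not from this patch): what the tensor-core path
// amounts to at the raw cuBLAS level. Conceptually, TensorCoreCublasCallIfAvailable
// hands the lambda a handle whose math mode permits Tensor Cores; with the
// CUDA 9/10 API that is the CUBLAS_TENSOR_OP_MATH mode (on CUDA 11+ the default
// math mode already allows Tensor Cores).
#include <cublas_v2.h>

void CallWithTensorOpMath(cublasHandle_t handle) {
  cublasMath_t saved_mode;
  cublasGetMathMode(handle, &saved_mode);            // remember the current mode
  cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);  // allow Tensor Cores
  // ... issue cublasGemmEx(...) / cublasSgemmEx(...) on this handle ...
  cublasSetMathMode(handle, saved_mode);             // restore the previous mode
}
// -----------------------------------------------------------------------------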
- // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const void *beta, - void *C, - cudaDataType_t Ctype, - int ldc, - cublasComputeType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx is not supported on cuda <= 7.5")); -#endif - } -}; - -template <> -struct CUBlas> { - static void GEMV(cublasHandle_t handle, - cublasOperation_t transa, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgemv( - handle, - transa, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void AXPY(cublasHandle_t handle, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCaxpy( - handle, - n, - reinterpret_cast(alpha), - reinterpret_cast(X), - incX, - reinterpret_cast(Y), - incY)); - } - - static void GEMM_STRIDED_BATCH(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - long long int strideA, // NOLINT - const phi::dtype::complex *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "CgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - static void GEMM(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgemm( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - 
reinterpret_cast(C), - ldc)); - } - - static void TRSM(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t transa, - cublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCtrsm( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const void *beta, - void *C, - cudaDataType_t Ctype, - int ldc, - cublasComputeType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx is not supported on cuda <= 7.5")); -#endif - } - - static void TRSM_BATCH(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t transa, - cublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **B, - int ldb, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCtrsmBatched( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - batch_size)); - } - // ****************************************************************新增模版定义********************* - - static void GETRF_BATCH(cublasHandle_t handle, - int n, - phi::dtype::complex **A, - int lda, - int *ipiv, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetrfBatched( - handle, - n, - reinterpret_cast(A), - lda, - ipiv, - info, - batch_size)); - } - - static void GETRI_BATCH(cublasHandle_t handle, - int n, - const phi::dtype::complex **A, - int lda, - const int *ipiv, - phi::dtype::complex **Ainv, - int ldc, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetriBatched( - handle, - n, - reinterpret_cast(A), - lda, - ipiv, - reinterpret_cast(Ainv), - ldc, - info, - batch_size)); - } - - static void MATINV_BATCH(cublasHandle_t handle, - int n, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **Ainv, - int lda_inv, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCmatinvBatched( - handle, - n, - reinterpret_cast(A), - lda, - reinterpret_cast(Ainv), - lda_inv, - info, - batch_size)); - } - // ****************************************************************新增模版定义********************* -}; - -template <> -struct 
CUBlas> { - static void GEMV(cublasHandle_t handle, - cublasOperation_t transa, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgemv( - handle, - transa, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void AXPY(cublasHandle_t handle, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZaxpy( - handle, - n, - reinterpret_cast(alpha), - reinterpret_cast(X), - incX, - reinterpret_cast(Y), - incY)); - } - - static void GEMM_STRIDED_BATCH( - cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - long long int strideA, // NOLINT - const phi::dtype::complex *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "CgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - static void GEMM(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgemm( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void TRSM(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t transa, - cublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZtrsm( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb)); - } - - static void TRSM_BATCH(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t transa, - cublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **B, - int ldb, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZtrsmBatched( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - batch_size)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
- // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const void *beta, - void *C, - cudaDataType_t Ctype, - int ldc, - cublasComputeType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx is not supported on cuda <= 7.5")); -#endif - } - // &*******************************************新增模版定义************************* - static void GETRF_BATCH(cublasHandle_t handle, - int n, - phi::dtype::complex **A, - int lda, - int *ipiv, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetrfBatched( - handle, - n, - reinterpret_cast(A), - lda, - ipiv, - info, - batch_size)); - } - - static void GETRI_BATCH(cublasHandle_t handle, - int n, - const phi::dtype::complex **A, - int lda, - const int *ipiv, - phi::dtype::complex **Ainv, - int ldc, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetriBatched( - handle, - n, - reinterpret_cast(A), - lda, - ipiv, - reinterpret_cast(Ainv), - ldc, - info, - batch_size)); - } - - static void MATINV_BATCH(cublasHandle_t handle, - int n, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **Ainv, - int lda_inv, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZmatinvBatched( - handle, - n, - reinterpret_cast(A), - lda, - reinterpret_cast(Ainv), - lda_inv, - info, - batch_size)); - } - // &*******************************************新增模版定义************************* -}; - -inline void CheckGEMMNSize(int64_t N) { - constexpr int64_t kMaxN = 1073741823; - if (N > kMaxN) { - PADDLE_THROW(common::errors::Unimplemented( - "cublas GEMM does not support N > %ld. Got N = %ld. ", kMaxN, N)); - } -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - -#if CUDA_VERSION >= 8000 - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = const_cast(dev_ctx_); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "CUBlas::GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif - } else { - CheckGEMMNSize(N); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - CUDA_R_32F, - ldb, - A, - CUDA_R_32F, - lda, - &beta, - C, - CUDA_R_32F, - N); - } - } else { -#endif // CUDA_VERSION >= 8000 - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); - } else { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }); - } - -#if CUDA_VERSION >= 8000 - } -#endif // CUDA_VERSION >= 8000 -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas fp16 gemm requires GPU compute capability >= 53," - "but received %d", - dev_ctx_.GetComputeCapability())); - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - -#if CUDA_VERSION >= 8000 - // cublasHgemm does true FP16 computation which is slow for non-Volta - // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: - // input/output in fp16, computation in fp32, which can also be accelerated - // using tensor cores in volta GPUs. - auto &cuda_ctx = const_cast(dev_ctx_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16F, - ldb, - A, - CUDA_R_16F, - lda, - &h_beta, - C, - CUDA_R_16F, - N, - CUBLAS_COMPUTE_32F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - h_B, - ldb, - h_A, - lda, - &h_beta, - h_C, - N); - }); -#endif // CUDA_VERSION >= 8000 -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T *A, - const T *B, - U beta, - T *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - - T t_alpha = static_cast(alpha); - T t_beta = static_cast(beta); - -#if CUDA_VERSION >= 8000 - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = const_cast(dev_ctx_); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented("GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif - } else { - CheckGEMMNSize(N); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &t_alpha, - B, - CUDA_R_32F, - static_cast(ldb), - A, - CUDA_R_32F, - static_cast(lda), - &t_beta, - C, - CUDA_R_32F, - static_cast(N)); - } - } else { -#endif // CUDA_VERSION >= 8000 - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); - } else { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &t_alpha, - B, - static_cast(ldb), - A, - static_cast(lda), - &t_beta, - C, - static_cast(N)); - }); - } - -#if CUDA_VERSION >= 8000 - } -#endif // CUDA_VERSION >= 8000 -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - float alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - float beta, - phi::dtype::float16 *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - // PADDLE_ENFORCE_GE( - // dev_ctx_.GetComputeCapability(), - // 53, - // common::errors::InvalidArgument( - // "cublas fp16 gemm requires GPU compute capability >= 53," - // "but received %d", - // dev_ctx_.GetComputeCapability())); - - float h_alpha = alpha; - float h_beta = beta; - -#if CUDA_VERSION >= 8000 - auto &cuda_ctx = const_cast(dev_ctx_); -#endif - // cublasHgemm does true FP16 computation which is slow for non-Volta - // GPUs. So use cublasGemmEx instead which does pseudo FP16 computation: - // input/output in fp16, computation in fp32, which can also be accelerated - // using tensor cores in volta GPUs. 
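// -----------------------------------------------------------------------------
// Illustrative sketch only (not from this patch): a standalone cublasGemmEx
// call doing what the comment above describes - FP16 inputs/outputs with FP32
// accumulation (Tensor-Core eligible). Assumes CUDA 11+ (cublasComputeType_t)
// and row-major M x K, K x N, M x N device buffers d_a, d_b, d_c (hypothetical
// names). Operands are swapped because cuBLAS is column-major: row-major
// C = A * B is computed as column-major C^T = B^T * A^T.
#include <cublas_v2.h>
#include <cuda_fp16.h>

cublasStatus_t GemmFp16WithFp32Accum(cublasHandle_t handle,
                                     int M, int N, int K,
                                     const __half* d_a, const __half* d_b,
                                     __half* d_c) {
  const float alpha = 1.0f;
  const float beta = 0.0f;
  return cublasGemmEx(handle,
                      CUBLAS_OP_N, CUBLAS_OP_N,
                      N, M, K,             // swapped extents
                      &alpha,
                      d_b, CUDA_R_16F, N,  // B first, leading dim N
                      d_a, CUDA_R_16F, K,  // then A, leading dim K
                      &beta,
                      d_c, CUDA_R_16F, N,  // C, leading dim N
                      CUBLAS_COMPUTE_32F,  // accumulate in FP32
                      CUBLAS_GEMM_DEFAULT);
}
// -----------------------------------------------------------------------------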
- if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented("GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { -#if CUDA_VERSION >= 8000 - CheckGEMMNSize(N); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16F, - static_cast(ldb), - A, - CUDA_R_16F, - static_cast(lda), - &h_beta, - C, - CUDA_R_16F, - static_cast(N), - CUBLAS_COMPUTE_32F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &h_beta, - h_C, - static_cast(N)); - }); -#endif // CUDA_VERSION >= 8000 - } -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 80, - phi::errors::InvalidArgument( - "cublas bf16 gemm requires GPU compute capability >= 80," - "but received %d", - dev_ctx_.GetComputeCapability())); - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW( - common::errors::Unimplemented("cublasGemmEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - CheckGEMMNSize(N); - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - N, - CUBLAS_COMPUTE_32F, - algo)); - }); - } -#else - // raise error - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); - -#endif // CUDA_VERSION >= 11000 -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas complex64 gemm requires GPU compute capability >= 53," - "but received %d", - dev_ctx_.GetComputeCapability())); - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = thrust::complex(beta.real, beta.imag); - -#if CUDA_VERSION >= 8000 - auto &cuda_ctx = const_cast(dev_ctx_); -#endif - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented("GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { -#if CUDA_VERSION >= 8000 - CheckGEMMNSize(N); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - B, - CUDA_C_32F, - static_cast(ldb), - A, - CUDA_C_32F, - static_cast(lda), - &c_beta, - C, - CUDA_C_32F, - static_cast(N), - CUBLAS_COMPUTE_32F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }); -#endif // CUDA_VERSION >= 8000 - } -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. 
- int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas complex128 gemm requires GPU compute capability >= 53," - "but received %d", - dev_ctx_.GetComputeCapability())); - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = - thrust::complex(beta.real, beta.imag); - -#if CUDA_VERSION >= 8000 - auto &cuda_ctx = const_cast(dev_ctx_); -#endif - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented("GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { -#if CUDA_VERSION >= 8000 - CheckGEMMNSize(N); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - B, - CUDA_C_64F, - static_cast(ldb), - A, - CUDA_C_64F, - static_cast(lda), - &c_beta, - C, - CUDA_C_64F, - static_cast(N), - CUBLAS_COMPUTE_64F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }); -#endif // CUDA_VERSION >= 8000 - } -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - float alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - float beta, - phi::dtype::bfloat16 *C) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // PADDLE_ENFORCE_GE( - // dev_ctx_.GetComputeCapability(), - // 80, - // common::errors::InvalidArgument( - // "cublas bf16 gemm requires GPU compute capability >= 80," - // "but received %d", - // dev_ctx_.GetComputeCapability())); - - float h_alpha = alpha; - float h_beta = beta; - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW( - common::errors::Unimplemented("cublasGemmEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - CheckGEMMNSize(N); - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - A, - CUDA_R_16BF, - static_cast(lda), - &h_beta, - C, - CUDA_R_16BF, - static_cast(N), - CUDA_R_32F, - algo)); - }); - } -#else - // raise error - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); - -#endif // CUDA_VERSION >= 11000 -} - -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - -#if CUDA_VERSION >= 8000 - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = const_cast(dev_ctx_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - CUDA_R_32F, - ldb, - A, - CUDA_R_32F, - lda, - &beta, - C, - CUDA_R_32F, - ldc); - } else { -#endif // CUDA_VERSION >= 8000 - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); - -#if CUDA_VERSION >= 8000 - } -#endif // CUDA_VERSION >= 8000 -} - -template <> -template <> -inline void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - int lda, - const phi::dtype::float16 *B, - int ldb, - phi::dtype::float16 beta, - phi::dtype::float16 *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); -} - -template <> -template <> -inline void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - int lda, - const phi::dtype::bfloat16 *B, - int ldb, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int ldc) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; - - // PADDLE_ENFORCE_GE( - // dev_ctx_.GetComputeCapability(), - // 80, - // phi::errors::InvalidArgument( - // "cublas bf16 gemm requires GPU compute capability >= 80," - // "but received %d", - // dev_ctx_.GetComputeCapability())); - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - ldc, - CUBLAS_COMPUTE_32F, - algo)); - }); -#else - // raise error - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); - -#endif // CUDA_VERSION >= 11000 -} - -template <> -template -void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }); -} - -template <> -template -void Blas::SCAL(int n, const T alpha, T *x) const { - dev_ctx_.CublasCall( - [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); -} - -template <> -template -void Blas::VCOPY(int n, const T *x, T *y) const { - dev_ctx_.CublasCall( - [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }); -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { - // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve - // it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? 
N : K; - int64_t ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; - -#if CUDA_VERSION >= 9010 - if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || - std::is_same::value) { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - VLOG(4) << "use_half_precision_compute_type: " - << FLAGS_gemm_use_half_precision_compute_type; - - auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; -#if CUDA_VERSION >= 11000 - auto compute_type = CUBLAS_COMPUTE_32F; -#else - auto compute_type = CUDA_R_32F; -#endif - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - void *a = static_cast(&h_alpha); - void *b = static_cast(&h_beta); - // set ComputeType as CUDA_R_32F for fp16, for better accuracy - if (FLAGS_gemm_use_half_precision_compute_type == true && - std::is_same::value) { - a = static_cast(&alpha); - b = static_cast(&beta); -#if CUDA_VERSION >= 11000 - compute_type = CUBLAS_COMPUTE_16F; -#else - compute_type = CUDA_R_16F; -#endif - } - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - a, - B, - fp, - ldb, - strideB, - A, - fp, - lda, - strideA, - b, - C, - fp, - ldc, - strideC, - batchCount, - compute_type, - algo)); - }); - } - } else { -#endif // CUDA_VERSION >= 9010 - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &beta, - C, - ldc, - strideC, - static_cast(batchCount)); - }); - -#if CUDA_VERSION >= 9010 - } -#endif // CUDA_VERSION >= 9010 -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T *A, - const T *B, - U beta, - T *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - int64_t ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
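// [Editor's note] cublasGemmStridedBatchedEx runs batchCount GEMMs whose
// operands sit at fixed element offsets k*strideA, k*strideB, k*strideC from
// the base pointers; the wrapper above always packs the outputs densely, which
// is why strideC is fixed to M * N. A CPU reference with the same semantics
// (illustrative only):
#include <cstdint>

static void strided_batched_gemm_ref(int64_t M, int64_t N, int64_t K, float alpha,
                                     const float* A, int64_t strideA,
                                     const float* B, int64_t strideB, float beta,
                                     float* C, int64_t batchCount) {
  const int64_t strideC = M * N;  // outputs are laid out back-to-back
  for (int64_t b = 0; b < batchCount; ++b) {
    const float* Ab = A + b * strideA;
    const float* Bb = B + b * strideB;
    float* Cb = C + b * strideC;
    for (int64_t i = 0; i < M; ++i)
      for (int64_t j = 0; j < N; ++j) {
        float acc = 0.f;
        for (int64_t p = 0; p < K; ++p) acc += Ab[i * K + p] * Bb[p * N + j];
        Cb[i * N + j] = alpha * acc + beta * Cb[i * N + j];
      }
  }
}
// A strideA (or strideB) of 0 reuses the same matrix for every batch entry,
// which is how a single matrix is broadcast against a batched operand.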
CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; -#if CUDA_VERSION >= 9010 - if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || - std::is_same::value) { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - VLOG(4) << "use_half_precision_compute_type: " - << FLAGS_gemm_use_half_precision_compute_type; - - auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; -#if CUDA_VERSION >= 11000 - auto compute_type = CUBLAS_COMPUTE_32F; -#else - auto compute_type = CUDA_R_32F; -#endif - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - void *a = static_cast(&h_alpha); - void *b = static_cast(&h_beta); - // set ComputeType as CUDA_R_32F for fp16, for better accuracy - if (FLAGS_gemm_use_half_precision_compute_type == true && - std::is_same::value) { - a = static_cast(&alpha); - b = static_cast(&beta); -#if CUDA_VERSION >= 11000 - compute_type = CUBLAS_COMPUTE_16F; -#else - compute_type = CUDA_R_16F; -#endif - } - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE || - batchCount > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - a, - B, - fp, - static_cast(ldb), - strideB, - A, - fp, - static_cast(lda), - strideA, - b, - C, - fp, - static_cast(ldc), - strideC, - static_cast(batchCount), - compute_type, - algo)); - }); - } - } else { -#endif // CUDA_VERSION >= 9010 - T h_alpha = static_cast(alpha); - T h_beta = static_cast(beta); - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &h_beta, - C, - static_cast(ldc), - strideC, - static_cast(batchCount)); - }); - -#if CUDA_VERSION >= 9010 - } -#endif // CUDA_VERSION >= 9010 -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - int64_t ldc = N; - - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE || - batchCount > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }); - } -#else - // raise error - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " - "11")); -#endif // CUDA_VERSION >= 11000 -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - float alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - float beta, - phi::dtype::bfloat16 *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; - - float h_alpha = alpha; - float h_beta = beta; - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
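// [Editor's note] Every bf16 path above casts alpha/beta to float and requests
// CUBLAS_COMPUTE_32F: inputs and outputs stay bf16 (CUDA_R_16BF) but the dot
// products accumulate in fp32, which is what keeps results usable given bf16's
// 8-bit significand. A scalar sketch of that contract, with bf16 modeled by a
// float -> bf16 -> float rounding helper (truncation only, ignoring the real
// rounding mode; illustrative):
#include <cstdint>
#include <cstring>

static float round_to_bf16(float x) {  // keep only the top 16 bits of the float
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0xFFFF0000u;
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

static float bf16_dot_fp32_accumulate(const float* a, const float* b, int n) {
  float acc = 0.f;                                       // fp32 accumulator
  for (int i = 0; i < n; ++i)
    acc += round_to_bf16(a[i]) * round_to_bf16(b[i]);    // bf16 inputs
  return round_to_bf16(acc);                             // result stored as bf16
}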
"True" : "False"); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE || - batchCount > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }); - } -#else - // raise error - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " - "11")); -#endif // CUDA_VERSION >= 11000 -} - -// /*** -// * Uknow bug, parameters dislocation when calling BatchedGEMM. -// * Reference: paddle github PR #45530 and #55612 -// */ -// template <> -// template <> -// inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, -// CBLAS_TRANSPOSE transB, -// int M, -// int N, -// int K, -// float16 alpha, -// const float16 *A, -// const float16 *B, -// float16 beta, -// float16 *C, -// int batchCount, -// int64_t strideA, -// int64_t strideB) const { -// // Note that cublas follows fortran order, so the order is different from -// // the cblas convention. -// int lda = (transA == CblasNoTrans) ? K : M; -// int ldb = (transB == CblasNoTrans) ? N : K; -// int ldc = N; -// cublasOperation_t cuTransA = -// (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// cublasOperation_t cuTransB = -// (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// const int64_t strideC = M * N; - -// #if CUDA_VERSION >= 9010 -// if ((FLAGS_enable_cublas_tensor_op_math && -// (std::is_same::value)) || -// std::is_same::value) { -// cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = dev_ctx_.tensor_core_available(); -// if (use_tensor_op_math) { -// algo = CUBLAS_GEMM_DFALT_TENSOR_OP; -// } -// VLOG(5) << "use_tensor_op_math: " -// << (use_tensor_op_math ? "True" : "False"); -// VLOG(4) << "use_half_precision_compute_type: " -// << FLAGS_gemm_use_half_precision_compute_type; - -// auto fp = std::is_same::value ? 
CUDA_R_32F : CUDA_R_16F; -// #if CUDA_VERSION >= 11000 -// auto compute_type = CUBLAS_COMPUTE_32F; -// #else -// auto compute_type = CUDA_R_32F; -// #endif - -// float h_alpha = static_cast(alpha); -// float h_beta = static_cast(beta); -// void *a = static_cast(&h_alpha); -// void *b = static_cast(&h_beta); -// // set ComputeType as CUDA_R_32F for fp16, for better accuracy -// if (FLAGS_gemm_use_half_precision_compute_type == true && -// std::is_same::value) { -// a = static_cast(&alpha); -// b = static_cast(&beta); -// #if CUDA_VERSION >= 11000 -// compute_type = CUBLAS_COMPUTE_16F; -// #else -// compute_type = CUDA_R_16F; -// #endif -// } - -// dev_ctx_.TensorCoreCublasCallIfAvailable( -// [&](cublasHandle_t handle) { -// PADDLE_ENFORCE_GPU_SUCCESS( -// phi::dynload::cublasGemmStridedBatchedEx(handle, -// cuTransB, -// cuTransA, -// N, -// M, -// K, -// a, -// B, -// fp, -// ldb, -// strideB, -// A, -// fp, -// lda, -// strideA, -// b, -// C, -// fp, -// ldc, -// strideC, -// batchCount, -// compute_type, -// algo)); -// }); -// } else { -// #endif // CUDA_VERSION >= 9010 - -// dev_ctx_.CublasCall( -// [&](cublasHandle_t handle) { -// CUBlas::GEMM_STRIDED_BATCH(handle, -// cuTransB, -// cuTransA, -// N, -// M, -// K, -// &alpha, -// B, -// ldb, -// strideB, -// A, -// lda, -// strideA, -// &beta, -// C, -// ldc, -// strideC, -// batchCount); -// }, -// dev_ctx_.stream()); - -// #if CUDA_VERSION >= 9010 -// } -// #endif // CUDA_VERSION >= 9010 -// } - -// /*** -// * Uknow bug, parameters dislocation when calling BatchedGEMM. -// * Reference: paddle github PR #45530 and #55612 -// */ -// template <> -// template <> -// inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, -// CBLAS_TRANSPOSE transB, -// int M, -// int N, -// int K, -// double alpha, -// const double *A, -// const double *B, -// double beta, -// double *C, -// int batchCount, -// int64_t strideA, -// int64_t strideB) const { -// // Note that cublas follows fortran order, so the order is different from -// // the cblas convention. -// int lda = (transA == CblasNoTrans) ? K : M; -// int ldb = (transB == CblasNoTrans) ? N : K; -// int ldc = N; -// cublasOperation_t cuTransA = -// (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// cublasOperation_t cuTransB = -// (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// const int64_t strideC = M * N; -// dev_ctx_.CublasCall( -// [&](cublasHandle_t handle) { -// PADDLE_ENFORCE_GPU_SUCCESS( -// phi::dynload::cublasDgemmStridedBatched(handle, -// cuTransB, -// cuTransA, -// N, -// M, -// K, -// &alpha, -// B, -// ldb, -// strideB, -// A, -// lda, -// strideA, -// &beta, -// C, -// ldc, -// strideC, -// batchCount)); -// }, -// dev_ctx_.stream()); -// } - -// template <> -// template <> -// inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, -// CBLAS_TRANSPOSE transB, -// int M, -// int N, -// int K, -// phi::dtype::bfloat16 alpha, -// const phi::dtype::bfloat16 *A, -// const phi::dtype::bfloat16 *B, -// phi::dtype::bfloat16 beta, -// phi::dtype::bfloat16 *C, -// int batchCount, -// int64_t strideA, -// int64_t strideB) const { -// #if CUDA_VERSION >= 11000 -// // Note that cublas follows fortran order, so the order is different from -// // the cblas convention. -// int lda = (transA == CblasNoTrans) ? K : M; -// int ldb = (transB == CblasNoTrans) ? N : K; -// int ldc = N; -// cublasOperation_t cuTransA = -// (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// cublasOperation_t cuTransB = -// (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; -// const int64_t strideC = M * N; - -// float h_alpha = static_cast(alpha); -// float h_beta = static_cast(beta); - -// cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = dev_ctx->tensor_core_available(); -// if (use_tensor_op_math) { -// algo = CUBLAS_GEMM_DFALT_TENSOR_OP; -// } -// VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : -// "False"); - -// dev_ctx_.TensorCoreCublasCallIfAvailable( -// [&](cublasHandle_t handle) { -// PADDLE_ENFORCE_GPU_SUCCESS( -// phi::dynload::cublasGemmStridedBatchedEx(handle, -// cuTransB, -// cuTransA, -// N, -// M, -// K, -// &h_alpha, -// B, -// CUDA_R_16BF, -// ldb, -// strideB, -// A, -// CUDA_R_16BF, -// lda, -// strideA, -// &h_beta, -// C, -// CUDA_R_16BF, -// ldc, -// strideC, -// batchCount, -// CUBLAS_COMPUTE_32F, -// algo)); -// }); -// #else -// // raise error -// PADDLE_THROW(phi::errors::Unimplemented( -// "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " -// "11")); -// #endif // CUDA_VERSION >= 11000 -// } - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} - -#if defined(__NVCC__) -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - double alpha, - const double **A, - const double **B, - double beta, - double **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - float alpha, - const float **A, - const float **B, - float beta, - float **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 **A, - const phi::dtype::float16 **B, - phi::dtype::float16 beta, - phi::dtype::float16 **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas fp16 gemm requires GPU compute capability >= 53," - "but received %d", - dev_ctx_.GetComputeCapability())); - float f_alpha = static_cast(alpha); - float f_beta = static_cast(beta); - auto &cuda_ctx = const_cast(dev_ctx_); - CUBlas::GEMM_BATCH(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B, - CUDA_R_16F, - ldb, - A, - CUDA_R_16F, - lda, - &f_beta, - C, - CUDA_R_16F, - ldc, - batchCount, - CUBLAS_COMPUTE_32F); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 **A, - const phi::dtype::bfloat16 **B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 **C, - int batchCount) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // PADDLE_ENFORCE_GE( - // dev_ctx_.GetComputeCapability(), - // 80, - // phi::errors::InvalidArgument( - // "cublas bf16 gemm requires GPU compute capability >= 80," - // "but received %d", - // dev_ctx_.GetComputeCapability())); - - float f_alpha = static_cast(alpha); - float f_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
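// [Editor's note] cublas<X>gemmBatched takes an array of device pointers that
// must itself reside in device memory. The pointer-array specializations above
// receive A/B/C pointer arrays on the host, so they stage them through
// thrust::device_vector (whose assign performs the host-to-device copy) and
// hand cuBLAS the raw device address via .data().get(). A minimal sketch of
// just that staging step (CUDA + Thrust; illustrative only):
#include <thrust/device_vector.h>

static const float** stage_pointer_array(const float** host_ptrs, int batch_count,
                                         thrust::device_vector<const float*>* storage) {
  // Copy the host array of device pointers into device memory...
  storage->assign(host_ptrs, host_ptrs + batch_count);
  // ...and return the device address cuBLAS expects for Aarray/Barray/Carray.
  return storage->data().get();
}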
"True" : "False"); - - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B_ptr.data().get(), - CUDA_R_16BF, - ldb, - A_ptr.data().get(), - CUDA_R_16BF, - lda, - &f_beta, - C_ptr.data().get(), - CUDA_R_16BF, - ldc, - batchCount, - CUBLAS_COMPUTE_32F, - algo)); - }); -#else - // raise error - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmBatchedEx with bfloat16 is not supported on cuda <= 11")); - -#endif // CUDA_VERSION >= 11000 -} -#endif - -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - cublasSideMode_t cuSide = - (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; - cublasFillMode_t cuUplo = - (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasDiagType_t cuDiag = - (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::TRSM( - handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); - }); -} - -template <> -template -void Blas::BatchedGETRF( - int n, T **a, int *ipiv, int *info, int batch_size) const { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedGETRI(int n, - const T **a, - const int *ipiv, - T **a_inv, - int *info, - int batch_size) const { - PADDLE_ENFORCE_NE( - a_inv, - a, - phi::errors::InvalidArgument( - "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " - "in-place. The memory space of output matrix (address: %p) cannot " - "overlap memory space of input matrix (address: %p).", - a_inv, - a)); - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedMatInv( - int n, const T **a, T **a_inv, int *info, int batch_size) const { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, - int n, - int nrhs, - const T **a, - int lda, - int *ipiv, - T **b, - int ldb, - int *info, - int batch_size) const { - // use CUBLAS_OP_C (conjugate transpose) for complex - cublasOperation_t cuTrans = - (trans == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedTRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T **A, - int lda, - T **B, - int ldb, - int batch_size) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - cublasSideMode_t cuSide = - (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; - cublasFillMode_t cuUplo = - (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasDiagType_t cuDiag = - (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }); -} - -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/blas_impl.h b/backends/metax_gpu/kernels/funcs/blas/blas_impl.h deleted file mode 100644 index cb59d73bef8..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blas_impl.h +++ /dev/null @@ -1,2003 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include -#include -#include -#include - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define INT_MAX_VALUE 2147483647 - -namespace phi { -namespace funcs { - -namespace detail { -template -static void axpy( - int n, const T alpha, const T *x, const int incx, T *y, const int incy) { - // Y = Y + alpha * X - while (n-- > 0) { - *y += alpha * *x; - y = y + incy; - x = x + incx; - } -} -} // namespace detail - -template -struct CBlas; - -template <> -struct CBlas { - template - static void VCOPY(ARGS... args) { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas VCOPY do not supported on CPU, please check your code")); - } -}; - -template <> -struct CBlas { - template - static void VCOPY(ARGS... args) { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas VCOPY do not supported on CPU, please check your code")); - } -}; - -template <> -struct CBlas { - template - static void AXPY(ARGS... args) { - detail::axpy(args...); - } - - template - static void VCOPY(ARGS... 
args UNUSED) { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas VCOPY do not supported on CPU with bfloat16," - " please check your code")); - } - - template - static void VADD(int n, - const phi::dtype::bfloat16 *x, - const phi::dtype::bfloat16 *y, - phi::dtype::bfloat16 *z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } - } - - template - static void VMUL(int n, - const phi::dtype::bfloat16 *x, - const phi::dtype::bfloat16 *y, - phi::dtype::bfloat16 *z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } - } - - template - static void VSUB(int n, - const phi::dtype::bfloat16 *x, - const phi::dtype::bfloat16 *y, - phi::dtype::bfloat16 *z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] - y[i]; - } - } -}; - -#ifdef PADDLE_WITH_MKLML -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - phi::dynload::cblas_sgemm(args...); - } - - template - static float *GEMM_ALLOC(ARGS... args) { - return phi::dynload::cblas_sgemm_alloc(args...); - } - - template - static void GEMM_PACK(ARGS... args) { - phi::dynload::cblas_sgemm_pack(args...); - } - - template - static void GEMM_COMPUTE(ARGS... args) { - phi::dynload::cblas_sgemm_compute(args...); - } - - template - static void GEMM_FREE(ARGS... args) { - phi::dynload::cblas_sgemm_free(args...); - } - -#ifdef PADDLE_WITH_LIBXSMM - template - static void SMM_GEMM(ARGS... args) { - libxsmm_sgemm(args...); - } -#endif - - template - static void AXPY(ARGS... args) { - phi::dynload::cblas_saxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - phi::dynload::cblas_scopy(args...); - } - - template - static void GEMV(ARGS... args) { - phi::dynload::cblas_sgemv(args...); - } - - template - static float DOT(ARGS... args) { - return phi::dynload::cblas_sdot(args...); - } - - template - static void SCAL(ARGS... args) { - phi::dynload::cblas_sscal(args...); - } - - template - static float ASUM(ARGS... args) { - return phi::dynload::cblas_sasum(args...); - } - - template - static void GEMM_BATCH(ARGS... args) { - phi::dynload::cblas_sgemm_batch(args...); - } - - template - static void VADD(ARGS... args) { - phi::dynload::vsAdd(args...); - } - - template - static void VSUB(ARGS... args) { - phi::dynload::vsSub(args...); - } - - template - static void VMUL(ARGS... args) { - phi::dynload::vsMul(args...); - } - - template - static void VDIV(ARGS... args) { - phi::dynload::vsDiv(args...); - } - - template - static void VEXP(ARGS... args) { - phi::dynload::vsExp(args...); - } - - template - static void VSQUARE(ARGS... args) { - phi::dynload::vsSqr(args...); - } - - template - static void VPOW(ARGS... args) { - phi::dynload::vsPowx(args...); - } - - template - static void VINV(ARGS... args) { - phi::dynload::vsInv(args...); - } - - template - static void VMERF(ARGS... args) { - phi::dynload::vmsErf(args...); - } -#if !defined(_WIN32) - template - static void CSRMM(ARGS... args) { - phi::dynload::mkl_scsrmm(args...); - } -#endif - - template - static void TRSM(ARGS... args) { - phi::dynload::cblas_strsm(args...); - } -}; - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - phi::dynload::cblas_dgemm(args...); - } - - template - static double *GEMM_ALLOC(ARGS... args) { - return phi::dynload::cblas_dgemm_alloc(args...); - } - - template - static void GEMM_PACK(ARGS... args) { - phi::dynload::cblas_dgemm_pack(args...); - } - - template - static void GEMM_COMPUTE(ARGS... args) { - phi::dynload::cblas_dgemm_compute(args...); - } - - template - static void GEMM_FREE(ARGS... 
args) { - phi::dynload::cblas_dgemm_free(args...); - } - -#ifdef PADDLE_WITH_LIBXSMM - template - static void SMM_GEMM(ARGS... args) { - libxsmm_dgemm(args...); - } -#endif - - template - static void AXPY(ARGS... args) { - phi::dynload::cblas_daxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - phi::dynload::cblas_dcopy(args...); - } - - template - static void GEMV(ARGS... args) { - phi::dynload::cblas_dgemv(args...); - } - - template - static double DOT(ARGS... args) { - return phi::dynload::cblas_ddot(args...); - } - - template - static void SCAL(ARGS... args) { - phi::dynload::cblas_dscal(args...); - } - - template - static double ASUM(ARGS... args) { - return phi::dynload::cblas_dasum(args...); - } - - template - static void GEMM_BATCH(ARGS... args) { - phi::dynload::cblas_dgemm_batch(args...); - } - - template - static void VADD(ARGS... args) { - phi::dynload::vdAdd(args...); - } - - template - static void VSUB(ARGS... args) { - phi::dynload::vdSub(args...); - } - - template - static void VMUL(ARGS... args) { - phi::dynload::vdMul(args...); - } - - template - static void VDIV(ARGS... args) { - phi::dynload::vdDiv(args...); - } - - template - static void VEXP(ARGS... args) { - phi::dynload::vdExp(args...); - } - - template - static void VSQUARE(ARGS... args) { - phi::dynload::vdSqr(args...); - } - - template - static void VPOW(ARGS... args) { - phi::dynload::vdPowx(args...); - } - - template - static void VINV(ARGS... args) { - phi::dynload::vdInv(args...); - } - - template - static void VMERF(ARGS... args) { - phi::dynload::vmdErf(args...); - } -#if !defined(_WIN32) - template - static void CSRMM(ARGS... args) { - phi::dynload::mkl_dcsrmm(args...); - } -#endif - - template - static void TRSM(ARGS... args) { - phi::dynload::cblas_dtrsm(args...); - } -}; - -template <> -struct CBlas> { - template - static void AXPY(int n, - const phi::dtype::complex alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - phi::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY); - } - - template - static void VCOPY(ARGS... args) { - phi::dynload::cblas_ccopy(args...); - } - - // the libmklml_intel.so paddle used has no vcAdd, vcSub, - // vcMul, vcDiv apis before rebuild from source - // so replace with the raw operator methods - /* - template - static void VADD(ARGS... args) { - phi::dynload::vcAdd(args...); - } - - template - static void VSUB(ARGS... args) { - phi::dynload::vcSub(args...); - } - - template - static void VMUL(ARGS... args) { - phi::dynload::vcMul(args...); - } - - template - static void VDIV(ARGS... 
args) { - phi::dynload::vcDiv(args...); - } - */ - - template - static void VADD(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] + b[i]; - } - } - - template - static void VSUB(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] - b[i]; - } - } - - template - static void VMUL(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] * b[i]; - } - } - template - static void VDIV(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] / b[i]; - } - } - - template - static void GEMV(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE trans, - int M, - int N, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *X, - int incx, - phi::dtype::complex beta, - phi::dtype::complex *Y, - int incy) { - const void *a_ = (const void *)(A); - const void *x_ = (const void *)(X); - void *y_ = static_cast(Y); - phi::dynload::cblas_cgemv( - layout, trans, M, N, &alpha, a_, lda, x_, incx, &beta, y_, incy); - } - - template - static void GEMM(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE trans_a, - CBLAS_TRANSPOSE trans_b, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - phi::dtype::complex beta, - phi::dtype::complex *C, - int ldc) { - const void *a_ = (const void *)(A); - const void *b_ = (const void *)(B); - void *c_ = static_cast(C); - phi::dynload::cblas_cgemm(layout, - trans_a, - trans_b, - M, - N, - K, - &alpha, - a_, - lda, - b_, - ldb, - &beta, - c_, - ldc); - } - - static void TRSM(CBLAS_LAYOUT layout, - CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE trans_a, - CBLAS_DIAG diag, - int M, - int N, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - const void *a_ = (const void *)(A); - void *b_ = static_cast(B); - phi::dynload::cblas_ctrsm( - layout, side, uplo, trans_a, diag, M, N, &alpha, a_, lda, b_, ldb); - } - - template - static void GEMM_BATCH(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE *trans_a, - CBLAS_TRANSPOSE *trans_b, - int *M, - int *N, - int *K, - phi::dtype::complex *alpha, - const phi::dtype::complex **A, - const int *lda, - const phi::dtype::complex **B, - const int *ldb, - phi::dtype::complex *beta, - phi::dtype::complex **C, - const int *ldc, - int group_count, - int *group_size) { - const void **A_void = (const void **)(&(*A)); - const void **B_void = (const void **)(&(*B)); - void **C_void = reinterpret_cast(C); - - phi::dynload::cblas_cgemm_batch(layout, - trans_a, - trans_b, - M, - N, - K, - alpha, - A_void, - lda, - B_void, - ldb, - beta, - C_void, - ldc, - group_count, - group_size); - } - - template - static void GEMM_EX(ARGS... args) { - phi::dynload::cblas_cgemm_batch(args...); - } -}; - -template <> -struct CBlas> { - template - static void AXPY(int n, - const phi::dtype::complex alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - phi::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY); - } - - template - static void VCOPY(ARGS... 
args) { - phi::dynload::cblas_zcopy(args...); - } - - // the libmklml_intel.so paddle used has no vzAdd, vzSub, - // vzMul, vzDiv apis before rebuild from source - // so replace with the raw operator methods - /* - template - static void VADD(ARGS... args) { - phi::dynload::vzAdd(args...); - } - - template - static void VSUB(ARGS... args) { - phi::dynload::vzSub(args...); - } - - template - static void VMUL(ARGS... args) { - phi::dynload::vzMul(args...); - } - - template - static void VDIV(ARGS... args) { - phi::dynload::vzDiv(args...); - } - */ - - template - static void VADD(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] + b[i]; - } - } - - template - static void VSUB(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] - b[i]; - } - } - - template - static void VMUL(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] * b[i]; - } - } - template - static void VDIV(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] / b[i]; - } - } - - template - static void GEMV(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE trans, - int M, - int N, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *X, - int incx, - phi::dtype::complex beta, - phi::dtype::complex *Y, - int incy) { - const void *a_ = (const void *)(A); - const void *x_ = (const void *)(X); - void *y_ = static_cast(Y); - phi::dynload::cblas_zgemv( - layout, trans, M, N, &alpha, a_, lda, x_, incx, &beta, y_, incy); - } - - template - static void GEMM(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE trans_a, - CBLAS_TRANSPOSE trans_b, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - phi::dtype::complex beta, - phi::dtype::complex *C, - int ldc) { - const void *a_ = (const void *)(A); - const void *b_ = (const void *)(B); - void *c_ = static_cast(C); - phi::dynload::cblas_zgemm(layout, - trans_a, - trans_b, - M, - N, - K, - &alpha, - a_, - lda, - b_, - ldb, - &beta, - c_, - ldc); - } - - static void TRSM(CBLAS_LAYOUT layout, - CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE trans_a, - CBLAS_DIAG diag, - int M, - int N, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - const void *a_ = (const void *)(A); - void *b_ = static_cast(B); - phi::dynload::cblas_ztrsm( - layout, side, uplo, trans_a, diag, M, N, &alpha, a_, lda, b_, ldb); - } - - template - static void GEMM_BATCH(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE *trans_a, - CBLAS_TRANSPOSE *trans_b, - int *M, - int *N, - int *K, - phi::dtype::complex *alpha, - const phi::dtype::complex **A, - const int *lda, - const phi::dtype::complex **B, - const int *ldb, - phi::dtype::complex *beta, - phi::dtype::complex **C, - const int *ldc, - int group_count, - int *group_size) { - const void **A_void = (const void **)(&(*A)); - const void **B_void = (const void **)(&(*B)); - void **C_void = reinterpret_cast(C); - - phi::dynload::cblas_zgemm_batch(layout, - trans_a, - trans_b, - M, - N, - K, - alpha, - A_void, - lda, - B_void, - ldb, - beta, - C_void, - ldc, - group_count, - group_size); - } - - template - static void GEMM_EX(ARGS... 
args) { - phi::dynload::cblas_zgemm_batch(args...); - } -}; - -#else - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - cblas_sgemm(args...); - } - - template - static void AXPY(ARGS... args) { - cblas_saxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - cblas_scopy(args...); - } - - template - static void GEMV(ARGS... args) { - cblas_sgemv(args...); - } - - template - static void TRSM(ARGS... args) { - cblas_strsm(args...); - } -}; - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - cblas_dgemm(args...); - } - - template - static void AXPY(ARGS... args) { - cblas_daxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - cblas_dcopy(args...); - } - - template - static void GEMV(ARGS... args) { - cblas_dgemv(args...); - } - - template - static void TRSM(ARGS... args) { - cblas_dtrsm(args...); - } -}; - -template <> -struct CBlas> { - template - static void VCOPY(ARGS... args) { - cblas_ccopy(args...); - } - - template - static void AXPY(int n, - const phi::dtype::complex alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - cblas_caxpy(n, &alpha, X, incX, Y, incY); - } - - template - static void GEMV(const CBLAS_LAYOUT layout, - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - const phi::dtype::complex *X, - const int incX, - const phi::dtype::complex beta, - phi::dtype::complex *Y, - const int incY) { - cblas_cgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY); - } - - template - static void GEMM(const CBLAS_LAYOUT layout, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - const phi::dtype::complex *B, - const int ldb, - const phi::dtype::complex beta, - phi::dtype::complex *C, - const int ldc) { - cblas_cgemm( - layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); - } - - static void TRSM(const CBLAS_LAYOUT layout, - const CBLAS_SIDE side, - const CBLAS_UPLO uplo, - const CBLAS_TRANSPOSE transA, - const CBLAS_DIAG diag, - const int M, - const int N, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - phi::dtype::complex *B, - const int ldb) { - cblas_ctrsm(layout, side, uplo, transA, diag, M, N, &alpha, A, lda, B, ldb); - } -}; - -template <> -struct CBlas> { - template - static void VCOPY(ARGS... 
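// [Editor's note] The whole header is built on one idiom: a CBlas<T> struct
// template whose per-dtype specializations forward variadic arguments to the
// matching BLAS symbol (cblas_s* for float, cblas_d* for double, ...), so the
// generic Blas<DeviceContext> methods stay dtype-agnostic. A stripped-down,
// self-contained sketch of that dispatch pattern (all names here are
// hypothetical; no real BLAS is called):
template <typename T>
struct ScalDispatch;  // primary template intentionally left undefined

template <>
struct ScalDispatch<float> {
  template <typename... ARGS>
  static void SCAL(ARGS... args) { scal_impl_f(args...); }  // stands in for cblas_sscal
  static void scal_impl_f(int n, float a, float* x) {
    for (int i = 0; i < n; ++i) x[i] *= a;
  }
};

template <>
struct ScalDispatch<double> {
  template <typename... ARGS>
  static void SCAL(ARGS... args) { scal_impl_d(args...); }  // stands in for cblas_dscal
  static void scal_impl_d(int n, double a, double* x) {
    for (int i = 0; i < n; ++i) x[i] *= a;
  }
};

template <typename T>
void scal(int n, T a, T* x) {  // dtype-agnostic front end, like Blas::SCAL
  ScalDispatch<T>::SCAL(n, a, x);
}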
args) { - cblas_zcopy(args...); - } - - template - static void AXPY(int n, - const phi::dtype::complex alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - cblas_zaxpy(n, &alpha, X, incX, Y, incY); - } - - template - static void GEMV(const CBLAS_LAYOUT layout, - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - const phi::dtype::complex *X, - const int incX, - const phi::dtype::complex beta, - phi::dtype::complex *Y, - const int incY) { - cblas_zgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY); - } - - template - static void GEMM(const CBLAS_LAYOUT layout, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - const phi::dtype::complex *B, - const int ldb, - const phi::dtype::complex beta, - phi::dtype::complex *C, - const int ldc) { - cblas_zgemm( - layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); - } - - static void TRSM(const CBLAS_LAYOUT layout, - const CBLAS_SIDE side, - const CBLAS_UPLO uplo, - const CBLAS_TRANSPOSE transA, - const CBLAS_DIAG diag, - const int M, - const int N, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - phi::dtype::complex *B, - const int ldb) { - cblas_ztrsm(layout, side, uplo, transA, diag, M, N, &alpha, A, lda, B, ldb); - } -}; - -#endif - -template <> -struct CBlas { - static void GEMM(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 GEMM not supported on CPU, please check your code")); - } - - static void SMM_GEMM(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 SMM_GEMM not supported on CPU, please check your code")); - } - static void VMUL(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 VMUL not supported on CPU, please check your code")); - } - static void VEXP(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 VEXP not supported on CPU, please check your code")); - } - static void VSQUARE(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 VSQUARE not supported on CPU, please check your code")); - } - static void VPOW(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 VPOW not supported on CPU, please check your code")); - } - static void DOT(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 DOT not supported on CPU, please check your code")); - }; - static void SCAL(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 SCAL not supported on CPU, please check your code")); - }; - static void ASUM(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 ASUM not supported on CPU, please check your code")); - }; -#ifdef PADDLE_WITH_MKLML - static void GEMM_BATCH(...) 
{ - PADDLE_THROW(phi::errors::Unimplemented( - "float16 GEMM_BATCH not supported on CPU, please check your code")); - } -#endif -}; - -#ifdef PADDLE_WITH_MKLML -template <> -template -T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, - const int M, - const int N, - const int K) const { - return CBlas::GEMM_ALLOC(id, M, N, K); -} - -template <> -template -void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T *src, - const int ld, - T *dst) const { - CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); -} - -template <> -template -void Blas::GEMM_COMPUTE(int transA, - int transB, - int M, - int N, - int K, - const T *A, - const int lda, - const T *B, - const int ldb, - T beta, - T *C, - const int ldc) const { - CBlas::GEMM_COMPUTE( - CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); -} - -template <> -template -void Blas::GEMM_FREE(T *data) const { - CBlas::GEMM_FREE(data); -} -#endif - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW( - common::errors::Unimplemented("GEMM not supported for large tensor " - "size on CPU, please check your code!")); - } - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T *A, - const T *B, - U beta, - T *C) const { - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW( - common::errors::Unimplemented("GEMM not supported for large tensor " - "size on CPU, please check your code!")); - } - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - CBlas::GEMM(CblasRowMajor, - transA, - transB, - static_cast(M), - static_cast(N), - static_cast(K), - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA == false ? CblasNoTrans : CblasTrans, - transB == false ? 
CblasNoTrans : CblasTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template -template -void Blas::MatMul(const phi::DenseTensor &mat_a, - bool trans_a, - const phi::DenseTensor &mat_b, - bool trans_b, - T alpha, - phi::DenseTensor *mat_out, - T beta) const { - const auto &dim_a = mat_a.dims(); - const auto &dim_b = mat_b.dims(); - const auto &dim_out = mat_out->dims(); - PADDLE_ENFORCE_EQ( - dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - true, - phi::errors::InvalidArgument( - "The input and output of matmul should be matrix, the dim size must " - "be 2," - "but received dim size input_a:%d, input_b:%d, output:%d", - dim_a.size(), - dim_b.size(), - dim_out.size())); - PADDLE_ENFORCE_EQ( - mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), - true, - phi::errors::InvalidArgument("The places of matrices in the matmul " - "should be same, please check your " - "code.")); - - int M = dim_out[0]; - int N = dim_out[1]; - int K = !trans_a ? dim_a[1] : dim_a[0]; - - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !trans_b ? CblasNoTrans : CblasTrans; - - this->GEMM(transA, - transB, - M, - N, - K, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->data()); -} - -template <> -template -void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CBlas::AXPY(n, alpha, x, 1, y, 1); -} - -template <> -template -void Blas::VCOPY(int n, const T *x, T *y) const { - CBlas::VCOPY(n, x, 1, y, 1); -} - -template <> -template -void Blas::VADD(int n, const T *x, const T *y, T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VADD(n, x, y, z); -#else - if (x == z) { - this->template AXPY(n, (T)(1.), y, z); - } else { - this->template VCOPY(n, y, z); - this->template AXPY(n, (T)(1.), x, z); - } -#endif -} - -template <> -template -void Blas::VSUB(int n, const T *x, const T *y, T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSUB(n, x, y, z); -#else - // try to find if openblas support vsub - for (int i = 0; i < n; ++i) { - z[i] = x[i] - y[i]; - } -#endif -} - -template <> -template -void Blas::VMUL(int n, const T *x, const T *y, T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMUL(n, x, y, z); -#else - // try to find if openblas support vmul - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -#endif -} - -template <> -template -void Blas::VDIV(int n, const T *x, const T *y, T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VDIV(n, x, y, z); -#else - // try to find if openblas support vdiv - for (int i = 0; i < n; ++i) { - z[i] = x[i] / y[i]; - } -#endif -} - -template <> -template -void Blas::VEXP(int n, const T *x, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VEXP(n, x, y); -#else - // try to find if openblas support vexp - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -#endif -} - -template <> -template -void Blas::VSQUARE(int n, const T *x, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSQUARE(n, x, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = x[i] * x[i]; - } -#endif -} - -template <> -template -void Blas::VPOW(int n, const T *x, T a, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VPOW(n, x, a, y); -#else - for (int i = 0; i 
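// [Editor's note] Without MKL's vsAdd/vdAdd, VADD above is synthesized from
// VCOPY + AXPY, and the x == z alias check matters: copying y into z first
// would clobber x when they share storage. A standalone sketch of the same
// alias-aware composition (illustrative only):
#include <algorithm>
#include <cassert>

static void axpy_ref(int n, float a, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] += a * x[i];  // y += a * x
}

static void vadd_ref(int n, const float* x, const float* y, float* z) {
  if (x == z) {
    axpy_ref(n, 1.f, y, z);      // z (aliasing x) += y, no copy needed
  } else {
    std::copy(y, y + n, z);      // z = y  (VCOPY)
    axpy_ref(n, 1.f, x, z);      // z += x (AXPY)
  }
}

static void check_vadd_aliasing() {
  float x[3] = {1, 2, 3}, y[3] = {10, 20, 30};
  vadd_ref(3, x, y, x);          // output aliases x
  assert(x[0] == 11 && x[1] == 22 && x[2] == 33);
}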
< n; ++i) { - y[i] = std::pow(x[i], a); - } -#endif -} - -template <> -template -T Blas::DOT(int n, const T *x, const T *y) const { -#ifdef PADDLE_WITH_MKLML - return CBlas::DOT(n, x, 1, y, 1); -#else - // try to find if openblas support cblas_dot - T sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] * y[i]; - } - return sum; -#endif -} - -template <> -template -void Blas::SCAL(int n, const T a, T *x) const { -#ifdef PADDLE_WITH_MKLML - CBlas::SCAL(n, a, x, 1); -#else - // try to find if openblas support cblas_scal - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -#endif -} - -template <> -template -T Blas::ASUM(int n, T *x, int inc) const { - auto sum = static_cast(0.0); -#ifdef PADDLE_WITH_MKLML - sum = CBlas::ASUM(n, x, inc); -#else - // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum - for (int c = 0; c < n; ++c) { - sum += x[c]; - } -#endif - return sum; -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { - PADDLE_ENFORCE_NOT_NULL( - A, phi::errors::InvalidArgument("Pointer A should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - B, phi::errors::InvalidArgument("Pointer B should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - C, phi::errors::InvalidArgument("Pointer C should not be null.")); - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW( - common::errors::Unimplemented("CPU GEMM not supported for large tensor " - "size.")); - } - -#ifdef PADDLE_WITH_MKLML - if (batchCount > INT_MAX_VALUE) { - PADDLE_THROW(common::errors::Unimplemented( - "CPU GEMM not supported for large batch size in MKLML.")); - } - - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA]; - b_array[k] = &B[k * strideB]; - c_array[k] = &C[k * M * N]; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - reinterpret_cast(&M), - reinterpret_cast(&N), - reinterpret_cast(&K), - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - reinterpret_cast(&batchCount)); -#else - for (int k = 0; k < batchCount; ++k) { - auto *Ak = &A[k * strideA]; - auto *Bk = &B[k * strideB]; - auto *Ck = &C[k * M * N]; - this->template GEMM(transA, - transB, - reinterpret_cast(M), - reinterpret_cast(N), - reinterpret_cast(K), - alpha, - Ak, - Bk, - beta, - Ck); - } -#endif -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { -#ifdef PADDLE_WITH_MKLML - const int lda = (std::max)((transA == CblasNoTrans) ? K : M, 1); - const int ldb = (std::max)((transB == CblasNoTrans) ? 
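// [Editor's note] The MKLML path above converts a strided batch into MKL's
// grouped batch API: it materializes per-batch pointer arrays from the strides
// and calls cblas_?gemm_batch with group_count = 1, i.e. a single group in
// which every GEMM shares the same (transA, transB, M, N, K, alpha, beta) and
// only the pointers differ. A sketch of the array construction the wrapper
// performs; the final call is shown as an equivalent loop rather than the MKL
// entry point (illustrative only):
#include <cstdint>
#include <vector>

static void batched_gemm_via_pointer_arrays(int M, int N, int K, float alpha,
                                            const float* A, int64_t strideA,
                                            const float* B, int64_t strideB,
                                            float beta, float* C, int batchCount) {
  std::vector<const float*> a_array(batchCount), b_array(batchCount);
  std::vector<float*> c_array(batchCount);
  for (int k = 0; k < batchCount; ++k) {
    a_array[k] = A + k * strideA;          // &A[k * strideA]
    b_array[k] = B + k * strideB;          // &B[k * strideB]
    c_array[k] = C + k * int64_t(M) * N;   // outputs packed densely
  }
  // cblas_sgemm_batch would consume these arrays with group_count = 1 and
  // group_size = batchCount; a plain loop has the same semantics:
  for (int k = 0; k < batchCount; ++k)
    for (int i = 0; i < M; ++i)
      for (int j = 0; j < N; ++j) {
        float acc = 0.f;
        for (int p = 0; p < K; ++p) acc += a_array[k][i * K + p] * b_array[k][p * N + j];
        c_array[k][i * N + j] = alpha * acc + beta * c_array[k][i * N + j];
      }
}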
N : K, 1); - const int ldc = (std::max)(N, 1); - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - A, - &lda, - B, - &ldb, - &beta, - C, - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -#endif -} - -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead -template <> -template -void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int W1, - int H1, - int W2, - int H2, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t head_number, - bool split_b_vertical) const { - int lda = (transA == CblasNoTrans) ? W1 : H1; - int ldb = (transB == CblasNoTrans) ? W2 : H2; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - - if (split_b_vertical) { - int ldc = W2; - int sub_width = W2 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W2 / head_number) - : i * (W2 / head_number) * H2; - int sub_matC_offset = i * W2 / head_number; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &sub_width, - &H2, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - - } else { - PADDLE_ENFORCE_EQ( - W1, - H2, - phi::errors::InvalidArgument( - "The fisrt matrix width should be same as second matrix height," - "but received fisrt matrix width %d" - ", second matrix height %d", - W1, - H2)); - int ldc = W2 * head_number; - int sub_width = W1 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? 
i * (W1 / head_number) * W2 - : i * (W1 / head_number); - int sub_matC_offset = i * W2; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * head_number * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &W2, - &sub_width, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - } -} -#endif // @} End Group Blas MKLML: BatchedGEMMWithHead - -template -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { - this->template GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} - -template <> -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - - // Since the matrix is very small, - // so the unit of calculation is already very fast, - // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, - // use xsmm directly. - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - const T alpha = static_cast(1); - const T beta = static_cast(0); - CBlas::SMM_GEMM( - &transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N); - return; -#endif - - CBlas::GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} - -template -template -void Blas::MatMul(const phi::DenseTensor &mat_a, - const MatDescriptor &dim_a, - const phi::DenseTensor &mat_b, - const MatDescriptor &dim_b, - T alpha, - phi::DenseTensor *mat_out, - T beta) const { - MatMul(mat_a.data(), - dim_a, - mat_b.data(), - dim_b, - alpha, - mat_out->data(), - beta); -} - -template -template -void Blas::MatMul(const T *mat_a, - const MatDescriptor &dim_a, - const T *mat_b, - const MatDescriptor &dim_b, - T alpha, - T *mat_out, - T beta) const { - PADDLE_ENFORCE_EQ( - dim_a.width_, - dim_b.height_, - phi::errors::InvalidArgument( - "The fisrt matrix width should be same as second matrix height," - "but received fisrt matrix width %d" - ", second matrix height %d", - dim_a.width_, - dim_b.height_)); - - CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; - if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { - this->template GEMM(transA, - transB, - dim_a.height_, - dim_b.width_, - dim_a.width_, - alpha, - mat_a, - mat_b, - beta, - mat_out); - } else { - PADDLE_ENFORCE_EQ( - dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || - dim_b.batch_size_ == 0, - true, - phi::errors::InvalidArgument( - "dim_a.batch_size should be equal to dim_b.batch_size, or " - "one of dim_a.batch_size and dim_b.batch_size should be 0. " - "But got dim_a.batch_size = %d, dim_b.batch_size = %d.", - dim_a.batch_size_, - dim_b.batch_size_)); - this->template BatchedGEMM( - transA, - transB, - dim_a.height_, - dim_b.width_, - dim_a.width_, - alpha, - mat_a, - mat_b, - beta, - mat_out, - dim_a.batch_size_ == 0 ? 
dim_b.batch_size_ : dim_a.batch_size_, - dim_a.stride_, - dim_b.stride_); - } -} - -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) -// @{ Group Blas MKLML: MatMulWithHead -/* - * Multiply two matrices with multiple heads - * - * A new parameter, i.e. head_number, is added compared to normal MatMul. - * The head_number describes the number of heads a matrix is vertically - * split into. - * - * When the user calls this API, the multiplication of two big matrices is split - * into multiplications of several (head_number_) small matrices. e.g. if Mat A - * is [3, 24] and Mat B is [24, 4], when multiplying A and B with head_number as - * 4, Mat A will be split into 4 matrices of [3, 6] and Mat B will be - * (horizontally) split into 4 matrices of [6, 4]. The final result - * will be 4 matrices of [3, 4], i.e. [3, 16]. - * Another example: A is [3, 8], B is [2, 16], head_number is 4. In this - * case, A will be split into [3, 2], B will be (vertically) split into - * [2, 4]. The final result will be 4 matrices of [3, 4], i.e. [3, 16]. - */ -template -template -void Blas::MatMulWithHead(const phi::DenseTensor &mat_a, - const MatDescriptor &dim_a, - const phi::DenseTensor &mat_b, - const MatDescriptor &dim_b, - T alpha, - int head_number, - phi::DenseTensor *mat_out, - T beta, - bool mat_b_split_vertical) const { - PADDLE_ENFORCE_EQ( - dim_a.width_ % head_number, - 0, - phi::errors::InvalidArgument( - "The first input width must be a multiple of the head number, " - "but received first input width %d" - ", head_number %d", - dim_a.width_, - head_number)); - PADDLE_ENFORCE_GE( - head_number, - 1, - phi::errors::InvalidArgument("The head number should be greater than or equal to 1, " - "but received head number %d", - head_number)); - PADDLE_ENFORCE_LE( - head_number, - dim_a.width_, - phi::errors::InvalidArgument( - "The head number should be less than or equal to the first input width, " - "but received first input width %d" - ", head_number %d", - dim_a.width_, - head_number)); - CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; - - if (mat_b_split_vertical) { - PADDLE_ENFORCE_EQ( - dim_b.height_, - dim_a.width_ / head_number, - phi::errors::InvalidArgument( - "The second input height should be equal to the first input width " - "divided by the head number, but received second input height %d" - ", first input width / head_number %d", - dim_b.height_, - dim_a.width_ / head_number)); - PADDLE_ENFORCE_EQ( - dim_a.width_ % head_number, - 0, - phi::errors::InvalidArgument( - "The second input width should be a multiple of the head number, " - "but received second input width %d" - ", head_number %d", - dim_b.width_, - head_number)); - } - - if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { - int lda = !dim_a.trans_ ? dim_a.width_ : dim_a.height_; - int ldb = !dim_b.trans_ ? dim_b.width_ : dim_b.height_; - int sub_matA_offset; - int sub_matB_offset; - int sub_matC_offset; - int sub_mat_M = dim_a.height_; - int sub_mat_N; - int sub_mat_K; - int ldc; - - for (int i = 0; i < head_number; i++) { - sub_matA_offset = dim_a.trans_ - ? i * (dim_a.width_ / head_number) * dim_a.height_ - : i * (dim_a.width_ / head_number); - if (mat_b_split_vertical) { - sub_matB_offset = dim_b.trans_ - ?
i * (dim_b.width_ / head_number) * dim_b.height_ - : i * (dim_b.width_ / head_number); - sub_matC_offset = i * dim_b.width_ / head_number; - - sub_mat_N = dim_b.width_ / head_number; - sub_mat_K = dim_b.height_; - - ldc = dim_b.width_; - } else { - sub_matB_offset = - dim_b.trans_ ? i * (dim_b.height_ / head_number) - : i * (dim_b.height_ / head_number) * dim_b.width_; - sub_matC_offset = i * dim_b.width_; - - sub_mat_N = dim_b.width_; - sub_mat_K = dim_a.width_ / head_number; - - ldc = head_number * dim_b.width_; - } - - this->template GEMM(transA, - transB, - sub_mat_M, - sub_mat_N, - sub_mat_K, - alpha, - mat_a.data() + sub_matA_offset, - lda, - mat_b.data() + sub_matB_offset, - ldb, - beta, - mat_out->data() + sub_matC_offset, - ldc); - } - } else { - PADDLE_ENFORCE_EQ( - (dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || - dim_b.batch_size_ == 0), - true, - phi::errors::InvalidArgument( - "The first input batch size should be equal than second input," - "either two input batch size is 0, but received first input batch " - "size" - " %d, second input batch size %d", - dim_a.batch_size_, - dim_b.batch_size_)); - - this->template BatchedGEMMWithHead( - transA, - transB, - dim_a.width_, - dim_a.height_, - dim_b.width_, - dim_b.height_, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->data(), - dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_, - dim_a.stride_, - dim_b.stride_, - head_number, - mat_b_split_vertical); - } -} -#endif // @} End Group Blas MKLML: MatMulWithHead - -template -template -void Blas::VINV(int n, const T *a, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VINV(n, a, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = 1.0 / a[i]; - } -#endif -} - -template <> -template -void Blas::VMERF(int n, const T *a, T *y, int64_t mode) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMERF(n, a, y, mode); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::erf(a[i]); - } -#endif -} - -#ifdef PADDLE_WITH_MKLML -template <> -template -void Blas::CSRMM(const char *transa, - const int *m, - const int *n, - const int *k, - const T *alpha, - const char *matdescra, - const T *val, - const int *indx, - const int *pntrb, - const int *pntre, - const T *b, - const int *ldb, - const T *beta, - T *c, - const int *ldc) const { - CBlas::CSRMM(transa, - m, - n, - k, - alpha, - matdescra, - val, - indx, - pntrb, - pntre, - b, - ldb, - beta, - c, - ldc); -} -#endif - -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - CBlas::TRSM( - CblasRowMajor, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); -} - -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/blaslt_gemm_search.h b/backends/metax_gpu/kernels/funcs/blas/blaslt_gemm_search.h deleted file mode 100644 index 6dcc56f8569..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blaslt_gemm_search.h +++ /dev/null @@ -1,794 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include -#include -#include -#include - -#include "paddle/common/flags.h" -#include "paddle/phi/api/include/context_pool.h" -#include "paddle/phi/backends/dynload/cublasLt.h" -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/dense_tensor.h" - -COMMON_DECLARE_string(cublaslt_device_best_config); - -namespace phi { -namespace funcs { -namespace cublaslt_internal { - -const std::array split_k_candidates = {2, 3, 4, 5, 6, 8, 12, 16, 32}; - -struct CublasLtAlgoConfig { - int m; - int n; - int k; - int algo_id; - int swizzle; - int custom_option; - int tile; - int split_k_val; - int reduction_scheme; - int stages; -}; - -struct CublasLtAlgoSelectorParam { - float time{0.0}; - cublasLtMatmulAlgo_t algo; - CublasLtAlgoConfig algo_config; -}; - -inline bool compare_algo_time(const CublasLtAlgoSelectorParam& param_a, - const CublasLtAlgoSelectorParam& param_b) { - return (param_a.time < param_b.time); -} - -class CublasLtAlgoCache { - public: - static CublasLtAlgoCache& Instance() { - static CublasLtAlgoCache instance(100 /*search_times*/); - return instance; - } - - template - void RunAndMeasureAlgo(cublasLtHandle_t handle, - cublasLtMatmulDesc_t matmul_desc, - cublasLtMatrixLayout_t a_desc, - cublasLtMatrixLayout_t b_desc, - cublasLtMatrixLayout_t bias_desc, - cublasLtMatrixLayout_t c_desc, - void* alpha, - void* beta, - const InT* a, - const InT* b, - const OutT* bias, - OutT* c, - CublasLtAlgoSelectorParam& param, // NOLINT - cudaEvent_t& start_event, // NOLINT - cudaEvent_t& stop_event, // NOLINT - cudaStream_t stream) { - cublasStatus_t status; - cublasLtMatmulHeuristicResult_t heuristic_result; - status = dynload::cublasLtMatmulAlgoCheck(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - ¶m.algo, - &heuristic_result); - PADDLE_ENFORCE_GPU_SUCCESS(status); - if (status != CUBLAS_STATUS_SUCCESS) { - param.time = std::numeric_limits::max(); - return; - } - size_t workspace_size = heuristic_result.workspaceSize; - auto workspace = phi::memory_utils::Alloc( - phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()), - workspace_size, - phi::Stream(reinterpret_cast(stream))); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream)); - int repeats = search_times_; - - for (int loop = 0; loop < repeats; loop++) { - status = dynload::cublasLtMatmul(handle, - matmul_desc, - alpha, - a, - a_desc, - b, - b_desc, - beta, - bias, - bias_desc, - c, - c_desc, - ¶m.algo, - workspace->ptr(), - workspace_size, - stream); - if (status != CUBLAS_STATUS_SUCCESS) { - param.time = std::numeric_limits::max(); - return; - } - } - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - - float time; - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventElapsedTime(&time, start_event, stop_event)); - - param.time = time / repeats; - } - - template - cublasLtMatmulAlgo_t* CublasLtAlgoSelect(cublasLtHandle_t handle, - int 
m, - int n, - int k, - int batch_count, - const InT* a, - const InT* b, - const OutT* bias, - OutT* c, - void* alpha, - void* beta, - cublasLtMatmulDesc_t matmul_desc, - cublasLtMatrixLayout_t a_desc, - cublasLtMatrixLayout_t b_desc, - cublasLtMatrixLayout_t bias_desc, - cublasLtMatrixLayout_t c_desc, - cublasComputeType_t compute_type, - cudaDataType_t scale_type, - cudaDataType_t a_type, - cudaDataType_t b_type, - cudaDataType_t bias_type, - cudaDataType_t c_type, - cudaStream_t stream) { - // If we don't have config file and we do not search, here return nullptr - if (!has_config_file_ && search_times_ <= 0) { - return nullptr; - } - - // VLOG(0) << "m n k: " << m << " " << n << " " << k; - - int64_t seed = 0; - std::hash hash_fn; - - HashMatmulDesc(matmul_desc, &seed, hash_fn); - HashMatrixLayoutDesc(a_desc, &seed, hash_fn); - HashMatrixLayoutDesc(b_desc, &seed, hash_fn); - HashMatrixLayoutDesc(bias_desc, &seed, hash_fn); - HashMatrixLayoutDesc(c_desc, &seed, hash_fn); - - { - std::lock_guard lock(cache_mutex_); - if (algo_caches_.count(seed)) { - VLOG(3) << "CublasLtAlgoSelect Found in cache"; - return &algo_caches_[seed]; - } - } - - if (search_configs_.empty()) { - std::ifstream infile; - std::string config_file_path = FLAGS_cublaslt_device_best_config; - infile.open(config_file_path.c_str()); - if (infile.is_open()) { - size_t workspace_size; - float time; - char comma; - while (!infile.eof()) { - CublasLtAlgoConfig search_config; - infile >> search_config.m >> comma >> search_config.k >> comma >> - search_config.n >> comma >> search_config.algo_id >> comma >> - search_config.swizzle >> comma >> search_config.custom_option >> - comma >> search_config.tile >> comma >> - search_config.split_k_val >> comma >> - search_config.reduction_scheme >> comma >> search_config.stages >> - comma >> workspace_size >> comma >> time; - search_configs_.push_back(search_config); - } - infile.close(); - VLOG(3) << "Loaded " << search_configs_.size() << " configs"; - } - } - if (!search_configs_.empty()) { - auto configure_algo = [&](const CublasLtAlgoConfig& search_config) - -> cublasLtMatmulAlgo_t* { - cublasLtMatmulAlgo_t algo; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoInit(handle, - compute_type, - scale_type, - b_type, - a_type, - c_type, - c_type, - search_config.algo_id, - &algo)); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &search_config.custom_option, - sizeof(search_config.custom_option))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_TILE_ID, - &search_config.tile, - sizeof(search_config.tile))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &search_config.split_k_val, - sizeof(search_config.split_k_val))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, - &search_config.swizzle, - sizeof(search_config.swizzle))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &search_config.reduction_scheme, - sizeof(search_config.reduction_scheme))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_STAGES_ID, - &search_config.stages, - sizeof(search_config.stages))); - std::lock_guard lock(cache_mutex_); - algo_caches_[seed] = algo; - return 
&algo_caches_[seed]; - }; - const CublasLtAlgoConfig* pre = nullptr; - for (size_t i = 0; i < search_configs_.size(); i++) { - if (search_configs_[i].n == n && search_configs_[i].k == k && - m <= search_configs_[i].m) { - return configure_algo(search_configs_[i]); - } else if (search_configs_[i].n == n && search_configs_[i].k == k && - m > search_configs_[i].m) { - if (pre == nullptr || pre->m < search_configs_[i].m) - pre = &search_configs_[i]; - } - } - if (pre != nullptr) { - // use max m in file - return configure_algo(*pre); - } - } - - // if we have cache but not found algo, and we don't want to search, - // here return nullptr - if (search_times_ <= 0) { - return nullptr; - } - - VLOG(3) << "CublasLtAlgoSelect Not Found in cache"; - - // Get Ids - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoGetIds - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - int algo_ids[requested_algo_count_]; // NOLINT - - int num_algo_ids; - status = dynload::cublasLtMatmulAlgoGetIds(handle, - compute_type, - scale_type, - a_type, - b_type, - bias_type, - c_type, - requested_algo_count_, - algo_ids, - &num_algo_ids); - PADDLE_ENFORCE_GPU_SUCCESS(status); - - // Traverse all possible algo combinations - int step = 0; - int limit = 20000; - std::vector params; - - for (int idx = 0; idx < num_algo_ids; idx++) { - cublasLtMatmulAlgo_t algo; - - /* Initialize algo structure with given Algp ID */ - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoInit - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoInit(handle, - compute_type, - scale_type, - a_type, - b_type, - bias_type, - c_type, - algo_ids[idx], - &algo)); - - // Query the tiles enums supported by that algo which is used to alloc - // enough space to store it - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCapGetAttribute - size_t attr_size = 0; - - int batch_support; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT, - &batch_support, - sizeof(batch_support), - &attr_size)); - if (batch_count > 1 && batch_support == 0) { - continue; - } - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_TILE_IDS, nullptr, 0, &attr_size)); - - int num_tiles = static_cast(attr_size / sizeof(int)); - std::vector tiles(num_tiles == 0 ? 1 : num_tiles); - if (num_tiles == 0) { - tiles[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; - num_tiles = 1; - } else { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_TILE_IDS, - tiles.data(), - sizeof(int) * num_tiles, - &attr_size)); - } - - // Query the stages enums supported by that algo (cuda must >= 11.0) - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, nullptr, 0, &attr_size)); - int num_stages = static_cast(attr_size / sizeof(int)); - std::vector stages(num_stages == 0 ? 
1 : num_stages); - if (num_stages == 0) { - stages[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; - num_stages = 1; - } else { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_STAGES_IDS, - stages.data(), - sizeof(int) * num_stages, - &attr_size)); - } - - // Retrieve Other Algo Capabilities attributes - int splitk_support, red_mask, swizzling_max, custom_option_max; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, - &splitk_support, - sizeof(splitk_support), - &attr_size)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, - &red_mask, - sizeof(red_mask), - &attr_size)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, - &swizzling_max, - sizeof(swizzling_max), - &attr_size)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, - &custom_option_max, - sizeof(custom_option_max), - &attr_size)); - - /* Loop over the different tiles */ - for (int tile_id = 0; tile_id < num_tiles && step < limit; tile_id++) { - /* Loop over different stages count */ - for (int stage_id = 0; stage_id < num_stages && step < limit; - stage_id++) { - /* Loop over the different custom option if any */ - for (int custom_option = 0; - custom_option <= custom_option_max && step < limit; - custom_option++) { - /* Loop over the CTAs swizzling support */ - for (int k = 0; k <= swizzling_max && step < limit; k++) { - int splir_k_trial = 0; - if (splitk_support) { - splir_k_trial += - sizeof(split_k_candidates) / sizeof(split_k_candidates[0]); - } - - for (int l = 0; (l < (1 + splir_k_trial)) && (step < limit); - l++) { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_TILE_ID, - &tiles[tile_id], - sizeof(tiles[tile_id]))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_STAGES_ID, - &stages[stage_id], - sizeof(stages[stage_id]))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &custom_option, - sizeof(custom_option))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, - &k, - sizeof(k))); - int split_k_val = 1; - int reduction_scheme = CUBLASLT_REDUCTION_SCHEME_NONE; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &split_k_val, - sizeof(split_k_val))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &reduction_scheme, - sizeof(int))); - if (l > 0) { // Split-K case - split_k_val = split_k_candidates[l - 1]; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &split_k_candidates[l - 1], - sizeof(split_k_candidates[l - 1]))); - for (reduction_scheme = 1; - reduction_scheme < - static_cast(CUBLASLT_REDUCTION_SCHEME_MASK) && - (step < limit); - reduction_scheme = reduction_scheme << 1) { - if (reduction_scheme & red_mask) { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &reduction_scheme, - 
sizeof(reduction_scheme))); - - cublasLtMatmulHeuristicResult_t heurResult; - status = dynload::cublasLtMatmulAlgoCheck(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - &algo, - &heurResult); - if (status == CUBLAS_STATUS_SUCCESS) { - CublasLtAlgoSelectorParam param; - param.algo = algo; - param.algo_config.m = m; - param.algo_config.n = n; - param.algo_config.k = k; - param.algo_config.algo_id = algo_ids[idx]; - param.algo_config.tile = tiles[tile_id]; - param.algo_config.swizzle = k; - param.algo_config.custom_option = custom_option; - param.algo_config.split_k_val = split_k_val; - param.algo_config.reduction_scheme = reduction_scheme; - param.algo_config.stages = stages[stage_id]; - params.emplace_back(param); - step++; - } - } // end if - } - } else { - // Prepare algos - cublasLtMatmulHeuristicResult_t heurResult; - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCheck - status = dynload::cublasLtMatmulAlgoCheck(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - &algo, - &heurResult); - if (status == CUBLAS_STATUS_SUCCESS) { - CublasLtAlgoSelectorParam param; - param.algo = algo; - param.algo_config.m = m; - param.algo_config.n = n; - param.algo_config.k = k; - param.algo_config.algo_id = algo_ids[idx]; - param.algo_config.tile = tiles[tile_id]; - param.algo_config.swizzle = k; - param.algo_config.custom_option = custom_option; - param.algo_config.split_k_val = split_k_val; - param.algo_config.reduction_scheme = reduction_scheme; - param.algo_config.stages = stages[stage_id]; - params.emplace_back(param); - step++; - } - } - } - } - } - } - } - } - cudaEvent_t start_event; - cudaEvent_t stop_event; - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event)); - - if (step == 0) { - VLOG(3) << "No algo can be used"; - return nullptr; - } - - VLOG(3) << "CublasLtAlgoSelect Start testRun " << step << " " - << params.size(); - - for (int i = 0; i < step; i++) { - RunAndMeasureAlgo(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - alpha, - beta, - a, - b, - bias, - c, - params[i], - start_event, - stop_event, - stream); - } - std::sort(params.begin(), params.end(), compare_algo_time); - - size_t res_id = 0; - while (params[res_id].time == 0.0) { - res_id++; - if (res_id >= params.size()) break; - } - - if (res_id >= params.size()) { - VLOG(3) << "No algo can be used"; - return nullptr; - } - - VLOG(3) << "algo selected"; - - std::lock_guard lock(cache_mutex_); - algo_caches_[seed] = params[res_id].algo; - return &algo_caches_[seed]; - } - - ~CublasLtAlgoCache() { SerializeAlgoCachesToFile(); } - - private: - std::string algo_caches_file_{"./cublaslt_algo_caches_from_paddle"}; - std::unordered_map algo_caches_; - std::vector search_configs_; - int search_times_; - static constexpr int requested_algo_count_ = 100; - std::mutex cache_mutex_; - bool has_config_file_; - - explicit CublasLtAlgoCache(int search_times) - : search_times_(search_times), has_config_file_(true) { - // Init algo_caches_ from cache file - std::ifstream infile; - infile.open(algo_caches_file_); - if (!infile.is_open()) { - has_config_file_ = false; - VLOG(3) << "No CublasLtAlgoCache file found"; - return; - } - size_t cublaslt_version = 0, real_cublaslt_version = 0; - int64_t seed = 0; - std::array algo_data; - infile >> cublaslt_version; - VLOG(1) << "cublaslt_version " << cublaslt_version; - - if (dynload::cublasLtGetCudartVersion() != cublaslt_version) { - LOG(INFO) << 
algo_caches_file_ - << " is not compatible with current cublaslt_version " - << real_cublaslt_version; - return; - } - - while (!infile.eof()) { - infile >> seed >> algo_data[0] >> algo_data[1] >> algo_data[2] >> - algo_data[3] >> algo_data[4] >> algo_data[5] >> algo_data[6] >> - algo_data[7]; - - for (int i = 0; i < 8; ++i) { - algo_caches_[seed].data[i] = algo_data[i]; - } - } - infile.close(); - } - - // Serialize algo_caches_ to cache file - void SerializeAlgoCachesToFile() { - if (search_times_ > 0) { - int dev; - cudaGetDevice(&dev); - if (dev == 0) { - std::ofstream outfile; - outfile.open(algo_caches_file_, std::ios::out | std::ios::trunc); - outfile << dynload::cublasLtGetCudartVersion() << std::endl; - - for (const auto& [seed, algo] : algo_caches_) { - outfile << seed << " "; - for (size_t value : algo.data) { - outfile << value << " "; - } - outfile << std::endl; - } - outfile.close(); - } - } - } - - inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val) { - n--; - n |= (n >> 1); - n |= (n >> 2); - n |= (n >> 4); - n |= (n >> 8); - n |= (n >> 16); - return std::max(min_val, (n + 1)); - } - - void HashMatmulDesc(cublasLtMatmulDesc_t desc, - int64_t* seed, - const std::hash& hash_fn) { - size_t size_to_write; - int trans_a, trans_b; - uint32_t epilogue; - // int8_t fast_accum; - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescGetAttribute(desc, - CUBLASLT_MATMUL_DESC_TRANSA, - &trans_a, - sizeof(trans_a), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(trans_a)); - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescGetAttribute(desc, - CUBLASLT_MATMUL_DESC_TRANSB, - &trans_b, - sizeof(trans_b), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(trans_b)); - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescGetAttribute(desc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, - sizeof(epilogue), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(epilogue)); - - // PADDLE_ENFORCE_GPU_SUCCESS( - // dyl::cublasLtMatmulDescGetAttribute(desc, - // CUBLASLT_MATMUL_DESC_FAST_ACCUM, - // &fast_accum, - // sizeof(fast_accum), - // &size_to_write)); - // HashValue(seed, hash_fn, static_cast(fast_accum)); - } - - void HashMatrixLayoutDesc(cublasLtMatrixLayout_t desc, - int64_t* seed, - const std::hash& hash_fn) { - size_t size_to_write; - uint32_t dtype; - int32_t batch; - uint64_t row, col; - int64_t ld, batch_offset; - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatrixLayoutGetAttribute(desc, - CUBLASLT_MATRIX_LAYOUT_TYPE, - &dtype, - sizeof(dtype), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(dtype)); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batch, - sizeof(batch), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(batch)); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_ROWS, &row, sizeof(row), &size_to_write)); - HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(row, 32)); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_COLS, &col, sizeof(col), &size_to_write)); - HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(col, 32)); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), &size_to_write)); - HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(ld, 32)); - - // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - // desc, 
CUBLASLT_MATRIX_LAYOUT_ROWS, &row, sizeof(row), - // &size_to_write)); - // HashValue(seed, hash_fn, row); - - // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - // desc, CUBLASLT_MATRIX_LAYOUT_COLS, &col, sizeof(col), - // &size_to_write)); - // HashValue(seed, hash_fn, col); - - // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - // desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), &size_to_write)); - // HashValue(seed, hash_fn, ld); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &batch_offset, - sizeof(batch_offset), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(batch_offset)); - } - - void HashValue(int64_t* seed, - const std::hash& hash_fn, - int64_t value) { - *seed ^= hash_fn(value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2); - } -}; - -} // namespace cublaslt_internal -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/blaslt_impl.cu.h b/backends/metax_gpu/kernels/funcs/blas/blaslt_impl.cu.h deleted file mode 100755 index d98182abef3..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blaslt_impl.cu.h +++ /dev/null @@ -1,1137 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 - -#include // NOLINT - -#include "cuda.h" // NOLINT -#include "glog/logging.h" -// #include "paddle/phi/backends/dynload/cublasLt.h" -#include "paddle/phi/backends/gpu/cuda/cuda_helper.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/flags.h" -#include "paddle/phi/kernels/autotune/gpu_timer.h" -#include "paddle/phi/kernels/autotune/switch_autotune.h" - -PHI_DECLARE_int64(cublaslt_exhaustive_search_times); -#endif - -namespace phi { -namespace funcs { - -#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0) - -// Set this enum according to -// https://docs.nvidia.com/cuda/cublas/index.html#cublasltepilogue-t -// While kMatmul, kMatmulGrad, kMatmulGradWithoutBias share the same -// enum value, but if all elements for MatmulPlanner->GetKey() is same, -// no matter forward or backward, they could share the same descriptor -// cache, in that the descriptor is for description of matmul operation. 
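// [Editorial note, not part of the original patch] A minimal sketch of the
// cache-sharing remark above, assuming the ConvertFusedType() mapping defined
// just below: the three "plain" matmul variants all map to the same cuBLASLt
// epilogue, so a descriptor cached under one of them is reusable for the
// others whenever MatmulPlanner::GetKey() yields the same key.
//
//   assert(ConvertFusedType(MatmulFusedType::kMatmul) ==
//          CUBLASLT_EPILOGUE_DEFAULT);
//   assert(ConvertFusedType(MatmulFusedType::kMatmulGrad) ==
//          CUBLASLT_EPILOGUE_DEFAULT);
//   assert(ConvertFusedType(MatmulFusedType::kMatmulGradWithoutBias) ==
//          CUBLASLT_EPILOGUE_DEFAULT);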
-enum MatmulFusedType { - kMatmul = 0, - kMatmulGrad = 1, - kMatmulGradWithoutBias = 2, - kMatmulBias = 3, - kMatmulRelu = 4, - kMatmulBiasRelu = 5, - kMatmulBiasGelu = 6, - kMatmulBiasReluWithReservedData = 7, - kMatmulBiasGeluWithReservedData = 8, - kMatmulReluGrad = 9, - kMatmulGeluGrad = 10, - kMatmulBiasGradToA = 11, - kMatmulBiasGradToB = 12 -}; - -static cublasLtEpilogue_t ConvertFusedType(MatmulFusedType fused_type) { - static std::map fused_type_map = { - {MatmulFusedType::kMatmul, CUBLASLT_EPILOGUE_DEFAULT}, - {MatmulFusedType::kMatmulGrad, CUBLASLT_EPILOGUE_DEFAULT}, - {MatmulFusedType::kMatmulGradWithoutBias, CUBLASLT_EPILOGUE_DEFAULT}, - {MatmulFusedType::kMatmulBias, CUBLASLT_EPILOGUE_BIAS}, - {MatmulFusedType::kMatmulRelu, CUBLASLT_EPILOGUE_RELU}, - {MatmulFusedType::kMatmulBiasRelu, CUBLASLT_EPILOGUE_RELU_BIAS}, - {MatmulFusedType::kMatmulBiasGelu, CUBLASLT_EPILOGUE_GELU_BIAS}, - {MatmulFusedType::kMatmulBiasReluWithReservedData, - CUBLASLT_EPILOGUE_RELU_AUX_BIAS}, - {MatmulFusedType::kMatmulBiasGeluWithReservedData, - CUBLASLT_EPILOGUE_GELU_AUX_BIAS}, - {MatmulFusedType::kMatmulReluGrad, CUBLASLT_EPILOGUE_DRELU}, - {MatmulFusedType::kMatmulGeluGrad, CUBLASLT_EPILOGUE_DGELU}, - {MatmulFusedType::kMatmulBiasGradToA, CUBLASLT_EPILOGUE_BGRADA}, - {MatmulFusedType::kMatmulBiasGradToB, CUBLASLT_EPILOGUE_BGRADB}}; - - return fused_type_map[fused_type]; -} - -enum FusedGEMMGradInType { kDX = 0, kDY = 1, kDZ = 2 }; - -template -struct FusedGEMMGradTrait; - -template <> -struct FusedGEMMGradTrait { - static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; - static constexpr auto kXGradB = FusedGEMMGradInType::kDY; - static constexpr auto kXGradATrans = false; - static constexpr auto kXGradBTrans = true; - - static constexpr auto kYGradA = FusedGEMMGradInType::kDX; - static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; - static constexpr auto kYGradATrans = true; - static constexpr auto kYGradBTrans = false; -}; - -template <> -struct FusedGEMMGradTrait { - static constexpr auto kXGradA = FusedGEMMGradInType::kDY; - static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; - static constexpr auto kXGradATrans = false; - static constexpr auto kXGradBTrans = true; - - static constexpr auto kYGradA = FusedGEMMGradInType::kDX; - static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; - static constexpr auto kYGradATrans = false; - static constexpr auto kYGradBTrans = false; -}; - -template <> -struct FusedGEMMGradTrait { - static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; - static constexpr auto kXGradB = FusedGEMMGradInType::kDY; - static constexpr auto kXGradATrans = false; - static constexpr auto kXGradBTrans = false; - - static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; - static constexpr auto kYGradB = FusedGEMMGradInType::kDX; - static constexpr auto kYGradATrans = true; - static constexpr auto kYGradBTrans = false; -}; - -template <> -struct FusedGEMMGradTrait { - static constexpr auto kXGradA = FusedGEMMGradInType::kDY; - static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; - static constexpr auto kXGradATrans = true; - static constexpr auto kXGradBTrans = true; - - static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; - static constexpr auto kYGradB = FusedGEMMGradInType::kDX; - static constexpr auto kYGradATrans = true; - static constexpr auto kYGradBTrans = true; -}; - -// To tell any matmul or fused matmul operation from each other. 
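// [Editorial note, not part of the original patch] Hypothetical usage sketch
// of the planner defined below; batch, M, N, K and bias_ptr are assumed
// caller-provided values. The planner hashes shapes, transposes, dtype and
// fused type into a single key, and that key (via GenSubKey()) is what
// DescriptorSetter later uses to look up a cached cuBLASLt descriptor.
//
//   phi::funcs::MatmulPlanner planner(
//       /*x_dims=*/{batch, M, K},
//       /*y_dims=*/{batch, K, N},
//       /*trans_x=*/false,
//       /*trans_y=*/false,
//       phi::DataType::FLOAT16,
//       phi::funcs::MatmulFusedType::kMatmulBiasRelu,
//       /*bias_data=*/bias_ptr,
//       /*reserve_data=*/nullptr);
//   size_t sub_key = planner.GenSubKey();  // key into the autotuned matmul cache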
-struct MatmulPlanner { - public: - const void* bias{nullptr}; - void* aux_data{nullptr}; - - MatmulPlanner() {} - MatmulPlanner(const std::vector& x_dims, - const std::vector& y_dims, - const bool trans_x, - const bool trans_y, - phi::DataType dtype, - MatmulFusedType fused_type, - const void* bias_data = nullptr, - void* reserve_data = nullptr, // Commonly for ReLu bit-mask. - bool use_addto = false, - bool no_exchange = true) - : bias(bias_data), aux_data(reserve_data), fused_type_(fused_type) { - use_addto_ = use_addto; - key_ = phi::autotune::GenKey(x_dims, - y_dims, - static_cast(trans_x), - static_cast(trans_y), - static_cast(dtype), - static_cast(fused_type_), - static_cast(use_addto_), - static_cast(no_exchange)); - } - - bool UseAddTo() const { return use_addto_; } - size_t GetKey() const { return key_; } - MatmulFusedType GetFusedType() const { return fused_type_; } - - size_t GenSubKey() const { return key_; } - - private: - MatmulFusedType fused_type_; - bool use_addto_; - size_t key_; -}; - -template -cublasComputeType_t GetCudaComputeType() { - if (std::is_same::value) { - return CUBLAS_COMPUTE_64F; - } else if (std::is_same::value) { - return CUBLAS_COMPUTE_32I; - } else { - return CUBLAS_COMPUTE_32F; - } -} - -struct MatmulDescriptor { - public: - cublasLtMatmulDesc_t op_desc{nullptr}; - cublasLtMatrixLayout_t x_desc{nullptr}; - cublasLtMatrixLayout_t y_desc{nullptr}; - cublasLtMatrixLayout_t out_desc{nullptr}; - cublasLtMatmulAlgo_t* algo{nullptr}; - bool is_cached{false}; - - MatmulDescriptor() {} - MatmulDescriptor(const MatmulDescriptor& obj) { - algo = obj.algo; - x_desc = obj.x_desc; - y_desc = obj.y_desc; - op_desc = obj.op_desc; - out_desc = obj.out_desc; - is_cached = obj.is_cached; - } - - MatmulDescriptor& operator=(const MatmulDescriptor& obj) { - algo = obj.algo; - x_desc = obj.x_desc; - y_desc = obj.y_desc; - op_desc = obj.op_desc; - out_desc = obj.out_desc; - is_cached = obj.is_cached; - - return *this; - } - - ~MatmulDescriptor() PADDLE_MAY_THROW { - if (!is_cached) { - PADDLE_WARN_GPU_SUCCESS(dynload::cublasLtMatmulDescDestroy(op_desc)); - PADDLE_WARN_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(y_desc)); - PADDLE_WARN_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(x_desc)); - PADDLE_WARN_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(out_desc)); - delete algo; - - op_desc = nullptr; - x_desc = nullptr; - y_desc = nullptr; - out_desc = nullptr; - algo = nullptr; - } - } - - // x_desc, y_desc, op_desc are allocated in heap memory. 
- template - void Create(const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - phi::funcs::MatmulPlanner* planner, - const int batch_size = 1, - const int64_t stride_x = 0, - const int64_t stride_y = 0, - const int64_t stride_out = 0, - bool grad_for_dx = true) { - using MT = typename phi::dtype::MPTypeTrait::Type; - cudaDataType_t mat_type = phi::backends::gpu::ToCudaDataType(); - cudaDataType_t out_mat_type = phi::backends::gpu::ToCudaDataType(); - cudaDataType_t scale_type = phi::backends::gpu::ToCudaDataType(); - cublasComputeType_t compute_type = GetCudaComputeType(); - - if (std::is_same::value) { - out_mat_type = phi::backends::gpu::ToCudaDataType(); - scale_type = phi::backends::gpu::ToCudaDataType(); - } - - // Create operation descriptor; see cublasLtMatmulDescAttributes_t for - // details about defaults; just need to set the transforms for A and B - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescCreate(&op_desc, compute_type, scale_type)); - SetFusedEpilogueOpDescriptor(planner, trans_x, trans_y, N); - - // Create matrix descriptors - CreateMatrixLayout(&x_desc, mat_type, M, K, trans_x); - CreateMatrixLayout(&y_desc, mat_type, K, N, trans_y); - CreateMatrixLayout(&out_desc, out_mat_type, M, N, false); - - // Config batch size and stride. - if (batch_size > 1) { - SetBatchAndStride(x_desc, batch_size, stride_x); - SetBatchAndStride(y_desc, batch_size, stride_y); - SetBatchAndStride(out_desc, batch_size, stride_out); - } - } - - cublasLtMatmulAlgo_t* SetAlgo() { - // while entering this function, the desc shall be cached. - is_cached = true; - algo = new cublasLtMatmulAlgo_t; - return algo; - } - - template - void SetFusedEpiloguePtr(phi::funcs::MatmulPlanner* planner) { - if (planner->bias != nullptr) { - const T* bias_data = static_cast(planner->bias); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( - op_desc, - CUBLASLT_MATMUL_DESC_BIAS_POINTER, - &bias_data, - sizeof(bias_data))); - } - if (planner->aux_data != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( - op_desc, - CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, - &(planner->aux_data), - sizeof(planner->aux_data))); - } - } - - std::string GetDescResultString(std::string prefix, - bool has_algo = true) const { - std::ostringstream out; - out << prefix << " \n"; -#define GET_DESC_DATA_STRING(src) \ - do { \ - out << " " << #src << " = ["; \ - int num = sizeof((*src)) / sizeof(src->data[0]); \ - for (int i = 0; i < num; ++i) { \ - if (i == 0) { \ - out << src->data[i]; \ - } else { \ - out << ", " << src->data[i]; \ - } \ - } \ - out << "]\n"; \ - } while (0); - - if (has_algo) { - GET_DESC_DATA_STRING(algo); - } - GET_DESC_DATA_STRING(x_desc); - GET_DESC_DATA_STRING(y_desc); - GET_DESC_DATA_STRING(out_desc); - GET_DESC_DATA_STRING(op_desc); -#undef GET_DESC_DATA_STRING - return out.str(); - } - - void ExchangeXYDesc(bool no_exchange) {} - - protected: - void SetFusedEpilogueOpDescriptor(phi::funcs::MatmulPlanner* planner, - const bool trans_x, - const bool trans_y, - int64_t lead_dim) { - cublasOperation_t cublas_trans_x = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cublas_trans_y = trans_y ? 
CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescSetAttribute(op_desc, - CUBLASLT_MATMUL_DESC_TRANSB, - &cublas_trans_x, - sizeof(cublas_trans_x))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescSetAttribute(op_desc, - CUBLASLT_MATMUL_DESC_TRANSA, - &cublas_trans_y, - sizeof(cublas_trans_y))); - MatmulFusedType fused_type = planner->GetFusedType(); - if (fused_type != MatmulFusedType::kMatmul) { - cublasLtEpilogue_t cublaslt_fused_type = ConvertFusedType(fused_type); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescSetAttribute(op_desc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &cublaslt_fused_type, - sizeof(fused_type))); - } - if (planner->aux_data) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( - op_desc, - CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, - &lead_dim, - sizeof(lead_dim))); - } - } - - void CreateMatrixLayout(cublasLtMatrixLayout_t* desc, - cudaDataType type, - uint64_t rows, - uint64_t cols, - bool trans) { - if (trans) { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatrixLayoutCreate(desc, type, rows, cols, rows)); - } else { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatrixLayoutCreate(desc, type, cols, rows, cols)); - } - } - - void SetBatchAndStride(cublasLtMatrixLayout_t desc, - int batch_size, - int64_t stride) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutSetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batch_size, - sizeof(batch_size))); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutSetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &stride, - sizeof(stride))); - } -}; - -struct MatmulGradDescriptor : MatmulDescriptor { - public: - MatmulGradDescriptor() {} - - template - void Create(const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - phi::funcs::MatmulPlanner* planner, - const int batch_size = 1, - int64_t stride_x = 0, - int64_t stride_y = 0, - int64_t stride_out = 0, - bool grad_for_dx = true) { - using MT = typename phi::dtype::MPTypeTrait::Type; - cudaDataType_t mat_type = phi::backends::gpu::ToCudaDataType(); - cudaDataType_t scale_type = phi::backends::gpu::ToCudaDataType(); - cublasComputeType_t compute_type = GetCudaComputeType(); - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescCreate(&op_desc, compute_type, scale_type)); - this->SetFusedEpilogueOpDescriptor( - planner, trans_x, trans_y, TransX ? 
M : K); - - // Create operation desciriptor; see cublasLtMatmulDescAttributes_t for - // details about defaults; just need to set the transforms for A and B - this->CreateMatrixLayout(&x_desc, mat_type, N, M, true); - if (grad_for_dx) { - this->CreateMatrixLayout(&y_desc, mat_type, K, N, TransY); - this->CreateMatrixLayout( - &out_desc, phi::backends::gpu::ToCudaDataType(), M, K, TransX); - } else { - this->CreateMatrixLayout(&y_desc, mat_type, M, K, TransX); - this->CreateMatrixLayout( - &out_desc, phi::backends::gpu::ToCudaDataType(), K, N, TransY); - } - } - - void ExchangeXYDesc(bool no_exchange) { - if (no_exchange) { - return; - } - auto* temp = y_desc; - y_desc = x_desc; - x_desc = temp; - } -}; - -template -struct CublasLtBase { - public: - using MT = typename phi::dtype::MPTypeTrait::Type; - static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx, - size_t workspace_size) { - return phi::memory_utils::Alloc( - ctx.GetPlace(), - workspace_size, - phi::Stream(reinterpret_cast(ctx.stream()))); - } - - static void RunImpl(const phi::GPUContext& ctx, - MatmulDescT* desc, - const size_t sub_key, - const T* x_ptr, - const T* y_ptr, - OutT* out_ptr, - phi::funcs::MatmulPlanner* planner) { - MT alpha = static_cast(1); - MT beta = planner->UseAddTo() ? static_cast(1) : static_cast(0); - cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); - - // NOTE(limingshu): As workspace_size varies from different DL framework, - // I wonder is there any smarter idea for workspace setting, currently I - // just followed the settings from the NVIDIA colleague`s setting. - size_t workspace_size = static_cast(4) * 1024 * 1024; - phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size); - - if (planner != nullptr) { - if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && - (!desc->is_cached)) { - SearchBestAlgo(ctx, - cublaslt_handle, - desc, - static_cast(&alpha), - static_cast(&beta), - y_ptr, - x_ptr, - out_ptr, - workspace->ptr(), - workspace_size); - MatmulDescT* best_desc = new MatmulDescT(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched CublasltDescriptor] "); - - auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); - cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); - } - } - - VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] "); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmul(cublaslt_handle, - desc->op_desc, - static_cast(&alpha), - y_ptr, - desc->y_desc, - x_ptr, - desc->x_desc, - static_cast(&beta), - out_ptr, - desc->out_desc, - out_ptr, - desc->out_desc, - desc->algo, - workspace->ptr(), - workspace_size, - ctx.stream())); - } - - static void SearchBestAlgo(const phi::GPUContext& ctx, - const cublasLtHandle_t& lt_handle, - MatmulDescT* desc, - const void* alpha, - const void* beta, - const void* y_data, - const void* x_data, - void* out_data, - void* workspace_ptr, - size_t workspace_size) { - cublasLtMatmulPreference_t preference; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulPreferenceCreate(&preference)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute( - preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &workspace_size, - sizeof(workspace_size))); - - int returned_results = 0; - constexpr int requested_algo_count = 10; - std::vector heuristic_results( - requested_algo_count); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, - desc->op_desc, - desc->y_desc, - desc->x_desc, - desc->out_desc, - 
desc->out_desc, - preference, - requested_algo_count, - heuristic_results.data(), - &returned_results)); - PADDLE_ENFORCE_GT(returned_results, - 0, - phi::errors::Unavailable("No GEMM algorithm avaliable.")); - int best_algo_idx = -1; - if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) { - best_algo_idx = 0; - } else { - float min_time_cost = std::numeric_limits::max(); - for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { - float cur_time_cost = - RunAndMeasureAlgo(ctx, - lt_handle, - desc, - alpha, - beta, - y_data, - x_data, - out_data, - workspace_ptr, - workspace_size, - &(heuristic_results[algo_idx].algo)); - VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx - << "] time: " << cur_time_cost << " s"; - - if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) || - (cur_time_cost < min_time_cost)) { - best_algo_idx = algo_idx; - min_time_cost = cur_time_cost; - } - } - } - VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx; - - cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo(); - *best_algo = heuristic_results[best_algo_idx].algo; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulPreferenceDestroy(preference)); - } - - static float RunAndMeasureAlgo(const phi::GPUContext& ctx, - const cublasLtHandle_t& lt_handle, - MatmulDescT* desc, - const void* alpha, - const void* beta, - const void* y_data, - const void* x_data, - void* out_data, - void* workspace_ptr, - size_t workspace_size, - cublasLtMatmulAlgo_t* algo) { - int repeats = FLAGS_cublaslt_exhaustive_search_times; - if (repeats <= 0) { - return std::numeric_limits::max(); - } - - phi::GpuTimer timer; - float time_cost = 0.f; - const auto& stream = ctx.stream(); - - for (int i = 0; i < repeats; ++i) { - timer.Start(stream); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle, - desc->op_desc, - alpha, - y_data, - desc->y_desc, - x_data, - desc->x_desc, - beta, - out_data, - desc->out_desc, - out_data, - desc->out_desc, - algo, - workspace_ptr, - workspace_size, - stream)); - timer.Stop(stream); - ctx.Wait(); - auto time = timer.ElapsedTime(); - if (i > 0) { - // Exclude the warmup runtime. - time_cost += time; - } - } - return (time_cost / (repeats - 1)); - } -}; - -template <> -struct CublasLtBase { - public: - static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx, - size_t workspace_size) { - return phi::memory_utils::Alloc( - ctx.GetPlace(), - workspace_size, - phi::Stream(reinterpret_cast(ctx.stream()))); - } - - static void RunImpl(const phi::GPUContext& ctx, - MatmulDescriptor* desc, - const size_t sub_key, - const int8_t* x_ptr, - const int8_t* y_ptr, - int32_t* out_ptr, - phi::funcs::MatmulPlanner* planner) { - int32_t alpha = 1; - int32_t beta = - planner->UseAddTo() ? 
static_cast(1) : static_cast(0); - cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); - - size_t workspace_size = static_cast(4) * 1024 * 1024; - phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size); - - if (planner != nullptr) { - if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && - (!desc->is_cached)) { - SearchBestAlgo(ctx, - cublaslt_handle, - desc, - static_cast(&alpha), - static_cast(&beta), - y_ptr, - x_ptr, - out_ptr, - workspace->ptr(), - workspace_size); - MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched CublasltDescriptor] "); - - auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); - cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); - } - } - - VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] "); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmul(cublaslt_handle, - desc->op_desc, - static_cast(&alpha), - y_ptr, - desc->y_desc, - x_ptr, - desc->x_desc, - static_cast(&beta), - out_ptr, - desc->out_desc, - out_ptr, - desc->out_desc, - desc->algo, - workspace->ptr(), - workspace_size, - ctx.stream())); - } - - static void SearchBestAlgo(const phi::GPUContext& ctx, - const cublasLtHandle_t& lt_handle, - MatmulDescriptor* desc, - const void* alpha, - const void* beta, - const void* y_data, - const void* x_data, - void* out_data, - void* workspace_ptr, - size_t workspace_size) { - cublasLtMatmulPreference_t preference; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulPreferenceCreate(&preference)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute( - preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &workspace_size, - sizeof(workspace_size))); - - int returned_results = 0; - constexpr int requested_algo_count = 10; - std::vector heuristic_results( - requested_algo_count); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, - desc->op_desc, - desc->y_desc, - desc->x_desc, - desc->out_desc, - desc->out_desc, - preference, - requested_algo_count, - heuristic_results.data(), - &returned_results)); - PADDLE_ENFORCE_GT(returned_results, - 0, - phi::errors::Unavailable("No GEMM algorithm avaliable.")); - int best_algo_idx = -1; - if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) { - best_algo_idx = 0; - } else { - float min_time_cost = std::numeric_limits::max(); - for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { - float cur_time_cost = - RunAndMeasureAlgo(ctx, - lt_handle, - desc, - alpha, - beta, - y_data, - x_data, - out_data, - workspace_ptr, - workspace_size, - &(heuristic_results[algo_idx].algo)); - VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx - << "] time: " << cur_time_cost << " s"; - - if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) || - (cur_time_cost < min_time_cost)) { - best_algo_idx = algo_idx; - min_time_cost = cur_time_cost; - } - } - } - VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx; - - cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo(); - *best_algo = heuristic_results[best_algo_idx].algo; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulPreferenceDestroy(preference)); - } - - static float RunAndMeasureAlgo(const phi::GPUContext& ctx, - const cublasLtHandle_t& lt_handle, - MatmulDescriptor* desc, - const void* alpha, - const void* beta, - const void* y_data, - const void* x_data, - void* out_data, - void* workspace_ptr, - size_t workspace_size, - 
cublasLtMatmulAlgo_t* algo) { - int repeats = FLAGS_cublaslt_exhaustive_search_times; - if (repeats <= 0) { - return std::numeric_limits::max(); - } - - phi::GpuTimer timer; - float time_cost = 0.f; - const auto& stream = ctx.stream(); - - for (int i = 0; i < repeats; ++i) { - timer.Start(stream); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle, - desc->op_desc, - alpha, - y_data, - desc->y_desc, - x_data, - desc->x_desc, - beta, - out_data, - desc->out_desc, - out_data, - desc->out_desc, - algo, - workspace_ptr, - workspace_size, - stream)); - timer.Stop(stream); - ctx.Wait(); - auto time = timer.ElapsedTime(); - if (i > 0) { - // Exclude the warmup runtime. - time_cost += time; - } - } - return (time_cost / (repeats - 1)); - } -}; - -// To judge if desc is cached or not. -template -struct DescriptorSetter { - public: - DescT desc; - size_t sub_key{std::numeric_limits::min()}; - - DescriptorSetter(phi::funcs::MatmulPlanner* planner, - const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - const int batch_size = 1, - int64_t stride_x = 0, - int64_t stride_y = 0, - int64_t stride_out = 0, - const bool no_exchange = true, - bool grad_for_dx = true) { - if (std::is_same::value) { - if (!trans_x && !trans_y) { - PADDLE_ENFORCE_EQ( - (N % 4 == 0 || N == 1), - true, - phi::errors::InvalidArgument( - "The dimension size N used in int8 matmul must be 1 or a " - "multiple of 4 does not " - "match the size (%d) currently contained in the container.", - N)); - PADDLE_ENFORCE_EQ( - (K % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 matmul must be a multiple " - "of 4 does not " - "match the size (%d) currently contained in the container.", - K)); - } else if (!trans_x && trans_y) { - PADDLE_ENFORCE_EQ( - (K % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 matmul must be a multiple " - "of 4 does not " - "match the size (%d) currently contained in the container.", - K)); - } else if (trans_x && !trans_y) { - PADDLE_ENFORCE_EQ( - (M % 4 == 0 || M == 1), - true, - phi::errors::InvalidArgument( - "The dimension size M used in int8 matmul must be 1 or a " - "multiple of 4 does not " - "match the size (%d) currently contained in the container.", - M)); - PADDLE_ENFORCE_EQ( - (N % 4 == 0 || N == 1), - true, - phi::errors::InvalidArgument( - "The dimension size N used in int8 matmul must be 1 or a " - "multiple of 4 does not " - "match the size (%d) currently contained in the container.", - N)); - } else { - PADDLE_ENFORCE_EQ( - (M % 4 == 0 || M == 1), - true, - phi::errors::InvalidArgument( - "The dimension size M used in int8 matmul must be 1 or a " - "multiple of 4 does not " - "match the size (%d) currently contained in the container.", - M)); - PADDLE_ENFORCE_EQ( - (K % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 matmul must be a multiple " - "of 4 does not " - "match the size (%d) currently contained in the container.", - K)); - } - } - - if (planner != nullptr) { - sub_key = planner->GenSubKey(); - } - - auto& mamtul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); - if (mamtul_cache.FindSubKey(sub_key)) { - desc = *(reinterpret_cast(mamtul_cache.GetSubKey(sub_key))); - desc.template SetFusedEpiloguePtr(planner); - VLOG(7) << desc.GetDescResultString("[Heap CublasltDescriptor] "); - } else { - desc.template Create(M, - N, - K, - trans_x, - trans_y, - planner, - batch_size, - stride_x, - stride_y, - stride_out, 
- grad_for_dx); - desc.ExchangeXYDesc(no_exchange); - if (planner != nullptr) { - desc.template SetFusedEpiloguePtr(planner); - } - VLOG(7) << desc.GetDescResultString("[Stack CublasltDescriptor] ", false); - } - } -}; - -// For matmul with kernels autotune -template -struct MatmulWithCublasLt : public CublasLtBase { - public: - static void Run(const phi::GPUContext& ctx, - const T* x_data, - const T* y_data, - OutT* out_data, - const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - phi::funcs::MatmulPlanner* planner = nullptr) { - auto setter = DescriptorSetter( - planner, M, N, K, trans_x, trans_y); - CublasLtBase::RunImpl( - ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); - } - - static void RunWithBatch(const phi::GPUContext& ctx, - const T* x_data, - const T* y_data, - OutT* out_data, - const int64_t M, - const int64_t N, - const int64_t K, - bool trans_x, - bool trans_y, - int batch_size, - int64_t stride_x, - int64_t stride_y, - int64_t stride_out, - phi::funcs::MatmulPlanner* planner = nullptr) { - auto setter = DescriptorSetter(planner, - M, - N, - K, - trans_x, - trans_y, - batch_size, - stride_x, - stride_y, - stride_out); - CublasLtBase::RunImpl( - ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); - } - - static void RunWithBatch(const phi::GPUContext& ctx, - const T** x_data, - const T** y_data, - OutT** out_data, - const int64_t M, - const int64_t N, - const int64_t K, - bool trans_x, - bool trans_y, - int batch_size, - phi::funcs::MatmulPlanner* planner = nullptr) { - for (int i = 0; i < batch_size; ++i) { - Run(ctx, - x_data[i], - y_data[i], - out_data[i], - M, - N, - K, - trans_x, - trans_y, - planner); - } - } -}; - -// As for just Linear fused ephilogue below: out = matmul(x, y) + bias. -template -struct LinearWithCublasLt : public CublasLtBase { - static void Run(const phi::GPUContext& ctx, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* out, - const void* bias_data, - void* reserve_data, - const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - const MatmulFusedType fused_type) { - auto planner = phi::funcs::MatmulPlanner(common::vectorize(x->dims()), - common::vectorize(y->dims()), - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - fused_type, - bias_data, - reserve_data); - auto setter = DescriptorSetter( - &planner, M, N, K, trans_x, trans_y); - CublasLtBase::RunImpl(ctx, - &setter.desc, - setter.sub_key, - x->data(), - y->data(), - out->data(), - &planner); - } -}; - -template -struct LinearGradWithCublasLt : public CublasLtBase { - static void Run( - const phi::GPUContext& ctx, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* out, - const void* bias_data, - void* reserve_data, - const int64_t M, - const int64_t N, - const int64_t K, - const MatmulFusedType fused_type, - const bool trans_x, - const bool trans_y, - const bool use_addto, - const bool no_exchange, // exchange x_desc and y_desc for grad. 
- bool grad_for_dx = true) { - auto planner = phi::funcs::MatmulPlanner(common::vectorize(x->dims()), - common::vectorize(y->dims()), - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - fused_type, - bias_data, - reserve_data, - use_addto, - no_exchange); - auto setter = - DescriptorSetter( - &planner, - M, - N, - K, - trans_x, - trans_y, - /*batch_size=*/1, - /*stride_x=*/0, - /*stride_y=*/0, - /*stride_out=*/0, - /*exchange_x_y_desc=*/no_exchange, - /*grad_for_dx=*/grad_for_dx); - - // To setting data type for different kinda out_data. - if (grad_for_dx) { - CublasLtBase::RunImpl( - ctx, - &setter.desc, - setter.sub_key, - no_exchange ? x->data() : y->data(), - no_exchange ? y->data() : x->data(), - out->data(), - &planner); - } else { - CublasLtBase::RunImpl( - ctx, - &setter.desc, - setter.sub_key, - no_exchange ? x->data() : y->data(), - no_exchange ? y->data() : x->data(), - out->data(), - &planner); - } - } -}; -#else -// A void structure just for successfully compile. -struct MatmulPlanner {}; -#endif // (PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 - -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublas.cc b/backends/metax_gpu/kernels/funcs/blas/cublas.cc deleted file mode 100644 index 77a0cced00b..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublas.cc +++ /dev/null @@ -1,40 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "cublas.h" // NOLINT - -namespace phi { -namespace dynload { -std::once_flag cublas_dso_flag; -void *cublas_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); - -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 -CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3 -CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); -#endif - -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 -CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); -#endif -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublas.h b/backends/metax_gpu/kernels/funcs/blas/cublas.h deleted file mode 100755 index 776c7a1723b..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublas.h +++ /dev/null @@ -1,148 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
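The removed cuBLASLt wrapper above times each candidate cublasLtMatmul configuration by running it FLAGS_cublaslt_exhaustive_search_times times and averaging all but the first (warm-up) run. Below is a minimal standalone sketch of that scheme, assuming only the phi::GpuTimer Start/Stop/ElapsedTime interface already used in the removed code; AverageAlgoTime and LaunchFn are illustrative names introduced here.

#include <limits>

// Returns the mean runtime of `launch` over `repeats` runs, excluding the
// warm-up run; a very large value means "never select this candidate".
template <typename LaunchFn>
float AverageAlgoTime(const phi::GPUContext& ctx, LaunchFn launch, int repeats) {
  if (repeats <= 1) {
    return std::numeric_limits<float>::max();  // need at least one timed run after warm-up
  }
  phi::GpuTimer timer;
  float total = 0.f;
  for (int i = 0; i < repeats; ++i) {
    timer.Start(ctx.stream());
    launch();                  // enqueue one candidate cublasLtMatmul call
    timer.Stop(ctx.stream());
    ctx.Wait();                // sync so ElapsedTime() covers exactly this run
    float t = timer.ElapsedTime();
    if (i > 0) {
      total += t;              // the first run warms caches and is dropped
    }
  }
  return total / (repeats - 1);
}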
-See the License for the specific language governing permissions and -limitations under the License. */ -// clang-format off -#pragma once -#include -#include - -#include // NOLINT -#include - -#include "kernels/dynload/dynamic_loader.h" -#include "./port.h" // NOLINT -// clang-format on -namespace phi { -namespace dynload { - -extern std::once_flag cublas_dso_flag; -extern void* cublas_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cublas routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using cublas_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(cublas_dso_flag, []() { \ - cublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ - }); \ - std::string replaced_name = #__name; \ - replaced_name = replaced_name.replace(0, 2, "mc"); \ - int index = replaced_name.find("_", 0); \ - if (index != -1) replaced_name = replaced_name.substr(0, index); \ - static void* p_##__name = \ - dlsym(cublas_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasSaxpy_v2); \ - __macro(cublasDaxpy_v2); \ - __macro(cublasCaxpy_v2); \ - __macro(cublasZaxpy_v2); \ - __macro(cublasSscal_v2); \ - __macro(cublasDscal_v2); \ - __macro(cublasScopy_v2); \ - __macro(cublasDcopy_v2); \ - __macro(cublasSgemv_v2); \ - __macro(cublasDgemv_v2); \ - __macro(cublasCgemv_v2); \ - __macro(cublasZgemv_v2); \ - __macro(cublasSgemm_v2); \ - __macro(cublasDgemm_v2); \ - __macro(cublasCgemm_v2); \ - __macro(cublasZgemm_v2); \ - __macro(cublasHgemm); \ - __macro(cublasSgemmEx); \ - __macro(cublasSgeam); \ - __macro(cublasDgeam); \ - __macro(cublasStrsm_v2); \ - __macro(cublasDtrsm_v2); \ - __macro(cublasCtrsm_v2); \ - __macro(cublasZtrsm_v2); \ - __macro(cublasCreate_v2); \ - __macro(cublasDestroy_v2); \ - __macro(cublasSetStream_v2); \ - __macro(cublasSetPointerMode_v2); \ - __macro(cublasGetPointerMode_v2); \ - __macro(cublasSgemmBatched); \ - __macro(cublasDgemmBatched); \ - __macro(cublasCgemmBatched); \ - __macro(cublasZgemmBatched); \ - __macro(cublasStrsmBatched); \ - __macro(cublasDtrsmBatched); \ - __macro(cublasCtrsmBatched); \ - __macro(cublasZtrsmBatched); \ - __macro(cublasSgetrfBatched); \ - __macro(cublasSgetriBatched); \ - __macro(cublasDgetrfBatched); \ - __macro(cublasDgetriBatched); \ - __macro(cublasSmatinvBatched); \ - __macro(cublasDmatinvBatched); \ - __macro(cublasSgetrsBatched); \ - __macro(cublasDgetrsBatched); \ - __macro(cublasCgetrfBatched); \ - __macro(cublasCgetriBatched); \ - __macro(cublasCmatinvBatched); \ - __macro(cublasZgetrfBatched); \ - __macro(cublasZgetriBatched); \ - __macro(cublasZmatinvBatched); - -CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) - -// APIs available after CUDA 8.0 -#if CUDA_VERSION >= 8000 -#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ - __macro(cublasGemmEx); \ - __macro(cublasSgemmStridedBatched); \ - __macro(cublasDgemmStridedBatched); \ - __macro(cublasCgemmStridedBatched); \ - __macro(cublasZgemmStridedBatched); \ - __macro(cublasHgemmStridedBatched); - -CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) -#endif - -// APIs available after CUDA 9.0 -#if CUDA_VERSION >= 9000 -#define 
CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ - __macro(cublasSetMathMode); \ - __macro(cublasGetMathMode); - -CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) -#endif - -// APIs available after CUDA 9.1 -#if CUDA_VERSION >= 9010 -#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ - __macro(cublasGemmBatchedEx); \ - __macro(cublasGemmStridedBatchedEx); - -CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) -#endif - -#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublasLt.cc b/backends/metax_gpu/kernels/funcs/blas/cublasLt.cc deleted file mode 100644 index 776f7fdd812..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublasLt.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "cublasLt.h" - -namespace phi { -namespace dynload { -std::once_flag cublasLt_dso_flag; -void *cublasLt_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublasLt.h b/backends/metax_gpu/kernels/funcs/blas/cublasLt.h deleted file mode 100644 index 2f8a929dd0c..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublasLt.h +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include // NOLINT -#include - -#include "./port.h" -#include "kernels/dynload/dynamic_loader.h" - -namespace phi { -namespace dynload { - -extern std::once_flag cublasLt_dso_flag; -extern void* cublasLt_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cublasLt routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using cublasLt_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(cublasLt_dso_flag, []() { \ - cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \ - }); \ - std::string replaced_name = #__name; \ - replaced_name = replaced_name.replace(0, 2, "mc"); \ - static void* p_##__name = \ - dlsym(cublasLt_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -// APIs available after CUDA 11.1 -#if CUDA_VERSION >= 11010 -#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasLtCreate); \ - __macro(cublasLtDestroy); \ - __macro(cublasLtMatmul); \ - __macro(cublasLtMatmulDescCreate); \ - __macro(cublasLtMatmulDescDestroy); \ - __macro(cublasLtMatmulDescSetAttribute); \ - __macro(cublasLtMatmulDescGetAttribute); \ - __macro(cublasLtMatrixLayoutCreate); \ - __macro(cublasLtMatrixLayoutDestroy); \ - __macro(cublasLtMatrixLayoutSetAttribute); \ - __macro(cublasLtMatrixLayoutGetAttribute); \ - __macro(cublasLtMatmulPreferenceCreate); \ - __macro(cublasLtMatmulPreferenceDestroy); \ - __macro(cublasLtMatmulPreferenceSetAttribute); \ - __macro(cublasLtMatmulAlgoGetHeuristic); \ - __macro(cublasLtMatrixTransform); \ - __macro(cublasLtMatrixTransformDescCreate); \ - __macro(cublasLtMatrixTransformDescDestroy); \ - __macro(cublasLtMatrixTransformDescSetAttribute); \ - __macro(cublasLtMatmulAlgoInit); \ - __macro(cublasLtMatmulAlgoConfigSetAttribute); \ - __macro(cublasLtMatmulAlgoConfigGetAttribute); \ - __macro(cublasLtMatmulAlgoGetIds); \ - __macro(cublasLtMatmulAlgoCapGetAttribute); \ - __macro(cublasLtMatmulAlgoCheck); -// __macro(cublasLtGetCudartVersion); -#else -#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasLtCreate); \ - __macro(cublasLtDestroy); \ - __macro(cublasLtMatmul); \ - __macro(cublasLtMatmulDescCreate); \ - __macro(cublasLtMatmulDescDestroy); \ - __macro(cublasLtMatmulDescSetAttribute); \ - __macro(cublasLtMatmulDescGetAttribute); \ - __macro(cublasLtMatrixLayoutCreate); \ - __macro(cublasLtMatrixLayoutDestroy); \ - __macro(cublasLtMatrixLayoutSetAttribute); \ - __macro(cublasLtMatrixLayoutGetAttribute); \ - __macro(cublasLtMatmulPreferenceCreate); \ - __macro(cublasLtMatmulPreferenceDestroy); \ - __macro(cublasLtMatmulPreferenceSetAttribute); \ - __macro(cublasLtMatmulAlgoGetHeuristic); \ - __macro(cublasLtMatrixTransform); \ - __macro(cublasLtMatrixTransformDescCreate); \ - __macro(cublasLtMatrixTransformDescDestroy); \ - __macro(cublasLtMatrixTransformDescSetAttribute); -#endif - -CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) -// #endif - -#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublaslt.h b/backends/metax_gpu/kernels/funcs/blas/cublaslt.h deleted file mode 100755 index 24505567baf..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublaslt.h +++ /dev/null @@ -1,328 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
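Both dynamic-load macros above resolve CUDA-named entry points against the MetaX runtime by rewriting the symbol before dlsym: the leading "cu" becomes "mc", and the cuBLAS variant additionally drops the "_v2"-style suffix. A small sketch of that renaming rule in isolation; ResolveMetaxSymbol and strip_version_suffix are illustrative names introduced here, not identifiers from the patch.

#include <dlfcn.h>
#include <string>

// Maps e.g. cublasSgemm_v2 -> mcblasSgemm and cublasLtMatmul -> mcblasLtMatmul.
void* ResolveMetaxSymbol(void* dso_handle,
                         const char* cuda_name,
                         bool strip_version_suffix) {
  std::string name = cuda_name;
  name.replace(0, 2, "mc");                  // "cu..." -> "mc..."
  if (strip_version_suffix) {
    auto pos = name.find('_');
    if (pos != std::string::npos) {
      name = name.substr(0, pos);            // drop "_v2" and similar suffixes
    }
  }
  return dlsym(dso_handle, name.c_str());    // nullptr if the symbol is missing
}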
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "./cublasLt.h" -#include "paddle/phi/common/float8_e4m3fn.h" -#include "paddle/phi/core/dense_tensor.h" - -namespace dyl = phi::dynload; - -namespace phi { - -struct CublasLtAlgoParam { - int algoId; - int swizzle; - int customOption; - int tile; - int splitK_val; - int reductionScheme; - int stages; - size_t workspace_size; -}; - -const std::map, CublasLtAlgoParam> AlgoParamCache{}; - -class CublasLtHelper { - public: - CublasLtHelper(int m, int k, int n, cublasLtHandle_t handle) - : handle_(handle), alpha_(1), beta_(0), m_(m), k_(k), n_(n) { - cublasStatus_t status; - - cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I; - - // matmul desc - status = dyl::cublasLtMatmulDescCreate( - &matmul_desc_, cudaComputeType, CUDA_R_32I); - - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatmulDescCreate execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - cublasOperation_t op_transpose = CUBLAS_OP_T; - status = dyl::cublasLtMatmulDescSetAttribute(matmul_desc_, - CUBLASLT_MATMUL_DESC_TRANSA, - &op_transpose, - sizeof(op_transpose)); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatmulDescSetAttribute execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - - // matrix desc - status = dyl::cublasLtMatrixLayoutCreate(&B_desc_, CUDA_R_8I, k, n, k); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatrixLayoutCreate execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - - status = dyl::cublasLtMatrixLayoutCreate(&A_desc_, CUDA_R_8I, k, m, k); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatrixLayoutCreate execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - - status = dyl::cublasLtMatrixLayoutCreate(&C_desc_, CUDA_R_32I, n, m, n); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatrixLayoutCreate execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - -#if CUDA_VERSION >= 11020 - - int algoId = 21; - int swizzle = 0; - int customOption = 0; - int tile = 15; - int splitK_val = 0; - int reductionScheme = 0; - int stages = 23; - workspace_size_ = 0; - if (m >= 128) { - tile = 20; - stages = 17; - } - - std::tuple key(m_, k_, n_); - if (AlgoParamCache.count(key) != 0) { - auto value = AlgoParamCache.at(key); - algoId = value.algoId; - swizzle = value.swizzle; - customOption = value.customOption; - tile = value.tile; - splitK_val = value.splitK_val; - reductionScheme = value.reductionScheme; - stages = value.stages; - workspace_size_ = value.workspace_size; - } - - dyl::cublasLtMatmulAlgoInit(handle_, - cudaComputeType, - CUDA_R_32I, - CUDA_R_8I, - CUDA_R_8I, - CUDA_R_32I, - CUDA_R_32I, - algoId, - &algo_); - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &(customOption), - sizeof(customOption)); - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile)); - dyl::cublasLtMatmulAlgoConfigSetAttribute(&algo_, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - 
&(splitK_val), - sizeof(splitK_val)); - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, - CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, - &(swizzle), - sizeof(swizzle)); - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(reductionScheme), - sizeof(int)); -#if CUDA_VERSION >= 11000 - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages)); -#endif -#endif - } - ~CublasLtHelper() {} - - void GEMM(const int8_t* A_dev, - const int8_t* B_dev, - int32_t* C_dev, - cudaStream_t stream, - void* workspace = nullptr) { - cublasStatus_t status; - - status = dyl::cublasLtMatmul(handle_, - matmul_desc_, - &alpha_, - B_dev, - B_desc_, - A_dev, - A_desc_, - &beta_, - C_dev, - C_desc_, - C_dev, - C_desc_, -#if CUDA_VERSION >= 11020 - &algo_, - workspace, - workspace_size_, -#else - nullptr, - nullptr, - 0, -#endif - stream); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatmul execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - } - - private: - cublasLtHandle_t handle_; - cublasLtMatmulDesc_t matmul_desc_; - cublasLtMatrixLayout_t A_desc_; - cublasLtMatrixLayout_t B_desc_; - cublasLtMatrixLayout_t C_desc_; - - cublasLtMatmulAlgo_t algo_; - - int32_t alpha_ = 1; - int32_t beta_ = 0; - - int m_ = 0; - int k_ = 0; - int n_ = 0; - - size_t workspace_size_ = 0; -}; - -template -inline cudaDataType_t GetCublasLtDataType() { - return CUDA_R_32F; -} - -template <> -inline cudaDataType_t GetCublasLtDataType() { - return CUDA_R_16F; -} - -template <> -inline cudaDataType_t GetCublasLtDataType() { - return CUDA_R_16BF; -} - -#if CUDA_VERSION >= 12010 -template -void CublasLtMatmulFP8(const phi::GPUContext& dev_ctx, - const phi::DenseTensor& mat_a, - const phi::DenseTensor& mat_b, - phi::DenseTensor* workspace, - phi::DenseTensor* out) { - int m = mat_a.dims()[0]; - int k = mat_a.dims()[1]; - int n = mat_b.dims()[1]; - - // init data structure - cublasStatus_t status; - auto A_type = CUDA_R_8F_E4M3; - auto B_type = CUDA_R_8F_E4M3; - auto C_type = GetCublasLtDataType(); - - cublasLtMatmulDesc_t matmul_desc_; - cublasLtMatrixLayout_t A_desc_; - cublasLtMatrixLayout_t B_desc_; - cublasLtMatrixLayout_t C_desc_; - float alpha_ = 1.0f; - float beta_ = 0.0f; - - cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F; - status = - dyl::cublasLtMatmulDescCreate(&matmul_desc_, cudaComputeType, CUDA_R_32F); - cublasOperation_t op_transpose = CUBLAS_OP_T; - status = dyl::cublasLtMatmulDescSetAttribute(matmul_desc_, - CUBLASLT_MATMUL_DESC_TRANSA, - &op_transpose, - sizeof(op_transpose)); - status = dyl::cublasLtMatrixLayoutCreate(&B_desc_, B_type, k, n, k); - status = dyl::cublasLtMatrixLayoutCreate(&A_desc_, A_type, k, m, k); - status = dyl::cublasLtMatrixLayoutCreate(&C_desc_, C_type, n, m, n); - - // Need to use heuristic - int returnedResults = 0; - cublasLtMatmulHeuristicResult_t heuristicResult = {}; - cublasLtMatmulPreference_t preference = NULL; - size_t work_space_size = workspace->numel(); - - status = dyl::cublasLtMatmulPreferenceCreate(&preference); - status = dyl::cublasLtMatmulPreferenceSetAttribute( - preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &work_space_size, - sizeof(work_space_size)); - - status = dyl::cublasLtMatmulAlgoGetHeuristic(dev_ctx.cublaslt_handle(), - matmul_desc_, - B_desc_, - A_desc_, - C_desc_, - C_desc_, - preference, - 1, - &heuristicResult, - &returnedResults); - - 
PADDLE_ENFORCE_NE(returnedResults, - 0, - common::errors::NotFound( - "Unable to find suitable cuBLAS GEMM algorithm")); - - status = - dyl::cublasLtMatmul(dev_ctx.cublaslt_handle(), - matmul_desc_, - &alpha_, - mat_b.data(), - B_desc_, - mat_a.data(), - A_desc_, - &beta_, - out->data(), - C_desc_, - out->data(), - C_desc_, - // nullptr, - &heuristicResult.algo, - // nullptr, - reinterpret_cast(workspace->data()), - // 0, - work_space_size, - dev_ctx.stream()); -} -#endif - -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/port.cc b/backends/metax_gpu/kernels/funcs/blas/port.cc deleted file mode 100644 index bc6d54e5c5f..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/port.cc +++ /dev/null @@ -1,163 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// clang-format off -#include "port.h" // NOLINT - -#include -#include -#include -#include -#include "glog/logging.h" -#if !defined(_WIN32) -#include // dladdr -#include -#include - -#else -#include // std::accumulate in msvc -// clang-format on -void *dlsym(void *handle, const char *symbol_name) { - FARPROC found_symbol; - found_symbol = GetProcAddress((HMODULE)handle, symbol_name); - - if (found_symbol == NULL) { - LOG(ERROR) << "Load symbol " << symbol_name << " failed."; - throw std::runtime_error(std::string(symbol_name) + " not found."); - } - return reinterpret_cast(found_symbol); -} - -void *dlopen(const char *filename, int flag) { - std::string file_name(filename); - HMODULE hModule = LoadLibrary(file_name.c_str()); - if (!hModule) { - if (flag) { - throw std::runtime_error(file_name + " not found."); - } else { - return nullptr; - } - } - return reinterpret_cast(hModule); -} - -int gettimeofday(struct timeval *tp, void *tzp) { - time_t clock; - struct tm tm; - SYSTEMTIME wtm; - - GetLocalTime(&wtm); - tm.tm_year = wtm.wYear - 1900; - tm.tm_mon = wtm.wMonth - 1; - tm.tm_mday = wtm.wDay; - tm.tm_hour = wtm.wHour; - tm.tm_min = wtm.wMinute; - tm.tm_sec = wtm.wSecond; - tm.tm_isdst = -1; - clock = mktime(&tm); - tp->tv_sec = clock; - tp->tv_usec = wtm.wMilliseconds * 1000; - - return (0); -} -#endif // !_WIN32 - -void ExecShellCommand(const std::string &cmd, std::string *message) { - std::array buffer; -#if !defined(_WIN32) - std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); -#else - std::shared_ptr pipe(_popen(cmd.c_str(), "r"), _pclose); -#endif // _WIN32 - if (!pipe) { - LOG(ERROR) << "error running command: " << cmd; - return; - } - while (!feof(pipe.get())) { - if (fgets(buffer.data(), 128, pipe.get()) != nullptr) { - *message += buffer.data(); - } - } -} - -bool PathExists(const std::string &path) { -#if !defined(_WIN32) - struct stat statbuf; - if (stat(path.c_str(), &statbuf) != -1) { - if (S_ISDIR(statbuf.st_mode)) { - return true; - } - } -#else - struct _stat statbuf; - if (_stat(path.c_str(), &statbuf) != -1) { - if 
(S_ISDIR(statbuf.st_mode)) { - return true; - } - } -#endif // !_WIN32 - return false; -} - -#if !defined(_WIN32) -constexpr char kSEP = '/'; -#else -constexpr char kSEP = '\\'; -#endif // _WIN32 - -bool FileExists(const std::string &filepath) { -#if !defined(_WIN32) - struct stat buffer; - return (stat(filepath.c_str(), &buffer) == 0); -#else - struct _stat buffer; - return (_stat(filepath.c_str(), &buffer) == 0); -#endif // !_WIN32 -} - -std::string DirName(const std::string &filepath) { - auto pos = filepath.rfind(kSEP); - if (pos == std::string::npos) { - return ""; - } - return filepath.substr(0, pos); -} - -void MkDir(const char *path) { - std::string path_error(path); - path_error += " mkdir failed!"; -#if !defined(_WIN32) - if (mkdir(path, 0755)) { - if (errno != EEXIST) { - throw std::runtime_error(path_error); - } - } -#else - BOOL return_value = CreateDirectory(path, NULL); - if (!return_value) { - auto errorno = GetLastError(); - if (errorno != ERROR_ALREADY_EXISTS) { - throw std::runtime_error(path_error); - } - } -#endif // !_WIN32 -} - -void MkDirRecursively(const char *fullpath) { - if (*fullpath == '\0') return; // empty string - if (FileExists(fullpath)) return; - - MkDirRecursively(DirName(fullpath).c_str()); - MkDir(fullpath); -} diff --git a/backends/metax_gpu/kernels/funcs/blas/port.h b/backends/metax_gpu/kernels/funcs/blas/port.h deleted file mode 100644 index d2a59199bb7..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/port.h +++ /dev/null @@ -1,61 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h - -#if !defined(_WIN32) -#include // dladdr -#include - -#else -#ifndef NOMINMAX -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#endif -// solve static linking error in windows -// https://github.com/google/glog/issues/301 -#define GOOGLE_GLOG_DLL_DECL -#include // _popen, _pclose -#include -#include -#include - -#ifndef S_ISDIR // windows port for sys/stat.h -#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) -#endif // S_ISDIR - -void *dlsym(void *handle, const char *symbol_name); - -void *dlopen(const char *filename, int flag); - -int gettimeofday(struct timeval *tp, void *tzp); -#endif // !_WIN32 - -void ExecShellCommand(const std::string &cmd, std::string *message); - -bool PathExists(const std::string &path); - -// TODO(yuyang18): If the functions below are needed by other files, move them -// to paddle::filesystem namespace. 
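A short, hypothetical usage of the portability helpers declared in the removed port files above; the command string and cache path are made-up examples, and the snippet assumes the ExecShellCommand/PathExists/MkDirRecursively declarations from that header.

#include <string>

void WarmUpKernelCacheDir() {
  std::string compiler_info;
  ExecShellCommand("cc --version", &compiler_info);    // best effort; stays empty on failure
  if (!PathExists("/tmp/metax_kernel_cache")) {
    MkDirRecursively("/tmp/metax_kernel_cache/blas");  // creates missing parent directories
  }
}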
-bool FileExists(const std::string &filepath); - -std::string DirName(const std::string &filepath); - -void MkDir(const char *path); - -void MkDirRecursively(const char *fullpath); diff --git a/backends/metax_gpu/kernels/funcs/layer_norm_util.h b/backends/metax_gpu/kernels/funcs/layer_norm_util.h index 3e16e615b1d..0f8210d8b8f 100644 --- a/backends/metax_gpu/kernels/funcs/layer_norm_util.h +++ b/backends/metax_gpu/kernels/funcs/layer_norm_util.h @@ -18,7 +18,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/device_context.h" -#include "../funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" // clang-format on namespace phi { diff --git a/backends/metax_gpu/kernels/funcs/quant_dequant.h b/backends/metax_gpu/kernels/funcs/quant_dequant.h deleted file mode 100644 index 301ae351c40..00000000000 --- a/backends/metax_gpu/kernels/funcs/quant_dequant.h +++ /dev/null @@ -1,430 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -// clang-format off -#include -#include "paddle/common/hostdevice.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/transform.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "blas/blas.h" -// clang-format on -namespace phi { - -using backends::gpu::GpuLaunchConfig; - -constexpr int DequantKernelVecSize = 4; - -template -inline HOSTDEVICE T roundWithTiesToEven(T x) { - T xLower = floor(x); - T xUpper = ceil(x); - // x is in interval [xl,xu]. Choose closest of two bounds, breaking ties to - // even. - T dLower = x - xLower; - T dUpper = xUpper - x; - return static_cast( - (dLower == dUpper ? fmod(xLower, 2.0F) == 0.0F : dLower < dUpper) - ? xLower - : xUpper); -} - -template -inline HOSTDEVICE T roundWithTiesAwayFromZero(T x) { - return static_cast(x > 0 ? ceil(x) : floor(x)); -} - -template -__forceinline__ __device__ int8_t quant_helper(const T input, - const float scale, - const int round_type, - const float max_bound, - const float min_bound) { - float quant_value = max_bound * scale * static_cast(input); - - if (round_type == 0) { - quant_value = static_cast(roundWithTiesToEven(quant_value)); - } else { - quant_value = static_cast(round(quant_value)); - } - quant_value = quant_value > max_bound ? max_bound : quant_value; - quant_value = quant_value < min_bound ? 
min_bound : quant_value; - return static_cast(quant_value); -} - -template -__forceinline__ __device__ int8_t -quant_helper_ties_to_even_or_away_from_zero(const T input, - const float scale, - const int round_type, - const float max_bound, - const float min_bound) { - float quant_value = max_bound * scale * static_cast(input); - - if (round_type == 0) { - quant_value = static_cast(roundWithTiesToEven(quant_value)); - } else { - quant_value = static_cast(roundWithTiesAwayFromZero(quant_value)); - } - quant_value = quant_value > max_bound ? max_bound : quant_value; - quant_value = quant_value < min_bound ? min_bound : quant_value; - return static_cast(quant_value); -} - -template -__global__ void QuantKernel(const T* input, - char4* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2; - int m_id = blockIdx.y * blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char4 tmp; - tmp.x = quant_helper( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - tmp.y = quant_helper( - input[m_id * n + n_id + 1], scale, round_type, max_bound, min_bound); - tmp.z = quant_helper( - input[m_id * n + n_id + 2], scale, round_type, max_bound, min_bound); - tmp.w = quant_helper( - input[m_id * n + n_id + 3], scale, round_type, max_bound, min_bound); - output[(m_id * n + n_id) >> 2] = tmp; - } -} - -template -__global__ void QuantKernelWithVecSize(const T* input, - char4* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2; - int m_id = blockIdx.y * blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char4 tmp; - tmp.x = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - tmp.y = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 1], scale, round_type, max_bound, min_bound); - tmp.z = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 2], scale, round_type, max_bound, min_bound); - tmp.w = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 3], scale, round_type, max_bound, min_bound); - output[(m_id * n + n_id) >> 2] = tmp; - } -} - -template -__global__ void QuantKernelWithVecSize(const T* input, - char3* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 3; - int m_id = blockIdx.y * blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char3 tmp; - tmp.x = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - tmp.y = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 1], scale, round_type, max_bound, min_bound); - tmp.z = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 2], scale, round_type, max_bound, min_bound); - output[(m_id * n + n_id) / 3] = tmp; - } -} - -template -__global__ void QuantKernelWithVecSize(const T* input, - char2* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 2; - int m_id = blockIdx.y * 
blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char2 tmp; - tmp.x = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - tmp.y = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 1], scale, round_type, max_bound, min_bound); - output[(m_id * n + n_id) >> 1] = tmp; - } -} - -template -__global__ void QuantKernelWithVecSize(const T* input, - char* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x); - int m_id = blockIdx.y * blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char tmp; - tmp = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - output[m_id * n + n_id] = tmp; - } -} - -template -void LaunchQuantKernel(const T* input, - int8_t* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound, - gpuStream_t stream) { - // TODO(minghaoBD): optimize the kennel launch times when m==1 or n==1 -#ifdef PADDLE_WITH_HIP - dim3 grid(((n >> 2) + 63) / 64, (m + 7) / 8); - dim3 block(64, 8); -#else - dim3 grid(((n >> 2) + 31) / 32, (m + 31) / 32); - dim3 block(32, 32); -#endif - - QuantKernel<<>>(input, - (char4*)output, // NOLINT - scale, - m, - n, - round_type, - max_bound, - min_bound); -} - -template -void LaunchQuantKernelWithVecSize(const T* input, - int8_t* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound, - gpuStream_t stream) { - int vec_size = 1; - if (n % 4 == 0) { - vec_size = 4; - } else if (n % 3 == 0) { - vec_size = 3; - } else if (n % 2 == 0) { - vec_size = 2; - } - -#ifdef PADDLE_WITH_HIP - dim3 grid(((n / vec_size) + 63) / 64, (m + 7) / 8); - dim3 block(64, 8); -#else - dim3 grid(((n / vec_size) + 31) / 32, (m + 31) / 32); - dim3 block(32, 32); -#endif - - switch (vec_size) { - case 4: - QuantKernelWithVecSize<<>>( - input, - reinterpret_cast(output), - scale, - m, - n, - round_type, - max_bound, - min_bound); - break; - case 3: - QuantKernelWithVecSize<<>>( - input, - reinterpret_cast(output), - scale, - m, - n, - round_type, - max_bound, - min_bound); - break; - case 2: - QuantKernelWithVecSize<<>>( - input, - reinterpret_cast(output), - scale, - m, - n, - round_type, - max_bound, - min_bound); - break; - case 1: - QuantKernelWithVecSize<<>>( - input, - reinterpret_cast(output), - scale, - m, - n, - round_type, - max_bound, - min_bound); - break; - default: - return; - } -} - -template -__global__ void DequantKernel(T* output, - const int32_t* input, - const int m, // batch size - const int n, // hidden - const float quant_in_scale, - const float* dequant_out_scale_data) { - int numel = m * n; - int stride = blockDim.x * gridDim.x * VecSize; - int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize; - int col_id = idx % n; - - phi::AlignedVector in_vec; - phi::AlignedVector out_scale_vec; - phi::AlignedVector out_vec; - - for (; idx < numel; idx += stride) { - phi::Load(input + idx, &in_vec); - phi::Load(dequant_out_scale_data + col_id, &out_scale_vec); - -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - out_vec[i] = - static_cast(static_cast(in_vec[i]) * out_scale_vec[i]); - } - - phi::Store(out_vec, output + idx); - } -} - -template -void 
LaunchDequantKernel(const int32_t* input, - T* output, - const int m, // m - const int n, // n - gpuStream_t stream, - GpuLaunchConfig* gpu_config, - const float quant_in_scale, - const float* dequant_out_scale_data) { - DequantKernel - <<block_per_grid, gpu_config->thread_per_block, 0, stream>>>( - output, input, m, n, quant_in_scale, dequant_out_scale_data); -} - -template -__global__ void DequantKernelWithScaleOfInputAndWeight( - T* output, - const int32_t* input, - const int m, // batch size - const int n, // hidden - const float quant_in_scale, - const float* quant_weight_scale, - float quant_max_bound) { - int numel = m * n; - int stride = blockDim.x * gridDim.x * VecSize; - int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize; - int col_id = idx % n; - - phi::AlignedVector in_vec; - phi::AlignedVector out_scale_vec; - phi::AlignedVector out_vec; - - for (; idx < numel; idx += stride) { - phi::Load(input + idx, &in_vec); - phi::Load(quant_weight_scale + col_id, &out_scale_vec); - -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - out_vec[i] = static_cast(static_cast(in_vec[i]) / - (quant_max_bound * quant_max_bound * - quant_in_scale * out_scale_vec[i])); - } - - phi::Store(out_vec, output + idx); - } -} - -template -void LaunchDequantKernelWithScaleOfInputAndWeight( - const int32_t* input, - T* output, - const int m, // m - const int n, // n - gpuStream_t stream, - GpuLaunchConfig* gpu_config, - const float quant_in_scale, - const float* quant_weight_scale, - float quant_max_bound) { - if (n % DequantKernelVecSize != 0) { - DequantKernelWithScaleOfInputAndWeight<<block_per_grid, - gpu_config->thread_per_block, - 0, - stream>>>(output, - input, - m, - n, - quant_in_scale, - quant_weight_scale, - quant_max_bound); - return; - } - DequantKernelWithScaleOfInputAndWeight - <<block_per_grid, gpu_config->thread_per_block, 0, stream>>>( - output, - input, - m, - n, - quant_in_scale, - quant_weight_scale, - quant_max_bound); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/gpudnn/cudnn.cc b/backends/metax_gpu/kernels/gpudnn/cudnn.cc deleted file mode 100644 index dc403282c1c..00000000000 --- a/backends/metax_gpu/kernels/gpudnn/cudnn.cc +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
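The removed quant_dequant.h above rests on two small decisions: scale, round ties to even (or away from zero), then clamp into the int8 range; and pick the widest store width in {4, 3, 2, 1} that divides the row width n. A host-side sketch of both, matching the formulas in roundWithTiesToEven, quant_helper, and LaunchQuantKernelWithVecSize; the function names here are illustrative.

#include <cmath>
#include <cstdint>

// Scale, round ties to even, then clamp to [min_bound, max_bound]
// (max_bound is typically 127.f for int8).
int8_t QuantizeTiesToEven(float x, float scale, float max_bound, float min_bound) {
  float v = max_bound * scale * x;
  float lo = std::floor(v);
  float hi = std::ceil(v);
  float d_lo = v - lo;
  float d_hi = hi - v;
  // On an exact tie, keep the even neighbour; otherwise take the nearer bound.
  float r = (d_lo == d_hi ? std::fmod(lo, 2.0f) == 0.0f : d_lo < d_hi) ? lo : hi;
  if (r > max_bound) r = max_bound;
  if (r < min_bound) r = min_bound;
  return static_cast<int8_t>(r);
}

// Widest vector store (char4/char3/char2/char) whose width divides n.
int PickQuantVecSize(int n) {
  if (n % 4 == 0) return 4;
  if (n % 3 == 0) return 3;
  if (n % 2 == 0) return 2;
  return 1;
}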
*/ - -#include "paddle/phi/backends/dynload/cudnn.h" // NOLINT - -#include "paddle/phi/core/enforce.h" - -namespace phi::dynload { - -std::once_flag cudnn_dso_flag; -void* cudnn_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8 -CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R7 -CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7 -CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7 -CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R8 -CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_FRONTEND -CUDNN_DNN_ROUTINE_EACH_FRONTEND(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9 -CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9 -CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R9 -CUDNN_DNN_ROUTINE_EACH_R9(DEFINE_WRAP); -#endif - -bool HasCUDNN() { - std::call_once(cudnn_dso_flag, - []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); - return cudnn_dso_handle != nullptr; -} - -void EnforceCUDNNLoaded(const char* fn_name) { - PADDLE_ENFORCE_NOT_NULL( - cudnn_dso_handle, - common::errors::PreconditionNotMet( - "Cannot load cudnn shared library. Cannot invoke method %s.", - fn_name)); -} - -} // namespace phi::dynload diff --git a/backends/metax_gpu/kernels/gpudnn/cudnn.h b/backends/metax_gpu/kernels/gpudnn/cudnn.h deleted file mode 100644 index 65cb6b338b7..00000000000 --- a/backends/metax_gpu/kernels/gpudnn/cudnn.h +++ /dev/null @@ -1,218 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef PADDLE_WITH_CUDA -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag cudnn_dso_flag; -extern void* cudnn_dso_handle; -extern bool HasCUDNN(); - -extern void EnforceCUDNNLoaded(const char* fn_name); -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using cudnn_func = decltype(&::__name); \ - std::call_once(cudnn_dso_flag, []() { \ - cudnn_dso_handle = phi::dynload::GetCUDNNDsoHandle(); \ - }); \ - EnforceCUDNNLoaded(#__name); \ - std::string replaced_name = #__name; \ - replaced_name = replaced_name.replace(0, 2, "mc"); \ - static void* p_##__name = \ - dlsym(cudnn_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name - -/** - * include all needed cudnn functions in HPPL - * different cudnn version has different interfaces - **/ -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor); \ - __macro(cudnnSetTensor4dDescriptorEx); \ - __macro(cudnnSetTensorNdDescriptor); \ - __macro(cudnnGetTensorNdDescriptor); \ - __macro(cudnnGetConvolutionNdForwardOutputDim); \ - __macro(cudnnCreateTensorDescriptor); \ - __macro(cudnnDestroyTensorDescriptor); \ - __macro(cudnnCreateFilterDescriptor); \ - __macro(cudnnSetFilter4dDescriptor); \ - __macro(cudnnSetFilterNdDescriptor); \ - __macro(cudnnGetFilterNdDescriptor); \ - __macro(cudnnSetPooling2dDescriptor); \ - __macro(cudnnSetPoolingNdDescriptor); \ - __macro(cudnnGetPoolingNdDescriptor); \ - __macro(cudnnDestroyFilterDescriptor); \ - __macro(cudnnCreateConvolutionDescriptor); \ - __macro(cudnnCreatePoolingDescriptor); \ - __macro(cudnnDestroyPoolingDescriptor); \ - __macro(cudnnSetConvolution2dDescriptor); \ - __macro(cudnnDestroyConvolutionDescriptor); \ - __macro(cudnnSetConvolutionNdDescriptor); \ - __macro(cudnnGetConvolutionNdDescriptor); \ - __macro(cudnnDeriveBNTensorDescriptor); \ - __macro(cudnnCreateSpatialTransformerDescriptor); \ - __macro(cudnnSetSpatialTransformerNdDescriptor); \ - __macro(cudnnDestroySpatialTransformerDescriptor); \ - __macro(cudnnSpatialTfGridGeneratorForward); \ - __macro(cudnnSpatialTfGridGeneratorBackward); \ - __macro(cudnnSpatialTfSamplerForward); \ - __macro(cudnnSpatialTfSamplerBackward); \ - __macro(cudnnCreate); \ - __macro(cudnnDestroy); \ - __macro(cudnnSetStream); \ - __macro(cudnnActivationForward); \ - __macro(cudnnActivationBackward); \ - __macro(cudnnConvolutionForward); \ - __macro(cudnnConvolutionBackwardBias); \ - __macro(cudnnGetConvolutionForwardWorkspaceSize); \ - __macro(cudnnTransformTensor); \ - __macro(cudnnPoolingForward); \ - __macro(cudnnPoolingBackward); \ - __macro(cudnnSoftmaxBackward); \ - __macro(cudnnSoftmaxForward); \ - __macro(cudnnGetVersion); \ - __macro(cudnnFindConvolutionForwardAlgorithmEx); \ - __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ - __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ - __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ - __macro(cudnnGetErrorString); \ - __macro(cudnnCreateDropoutDescriptor); \ - __macro(cudnnDropoutGetStatesSize); \ - __macro(cudnnSetDropoutDescriptor); \ - __macro(cudnnRestoreDropoutDescriptor); \ - __macro(cudnnCreateRNNDescriptor); \ - __macro(cudnnGetRNNParamsSize); \ - __macro(cudnnGetRNNWorkspaceSize); \ - __macro(cudnnGetRNNTrainingReserveSize); \ - __macro(cudnnRNNForwardTraining); \ - __macro(cudnnRNNBackwardData); \ - __macro(cudnnRNNBackwardWeights); \ - __macro(cudnnRNNForwardInference); \ - __macro(cudnnDestroyDropoutDescriptor); \ - __macro(cudnnDestroyRNNDescriptor); \ - __macro(cudnnSetTensorNdDescriptorEx); \ - __macro(cudnnAddTensor); \ - __macro(cudnnConvolutionBackwardData); \ - __macro(cudnnConvolutionBackwardFilter); \ - __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ - 
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize); \ - __macro(cudnnBatchNormalizationForwardTraining); \ - __macro(cudnnBatchNormalizationForwardInference); \ - __macro(cudnnBatchNormalizationBackward); \ - __macro(cudnnCreateActivationDescriptor); \ - __macro(cudnnSetActivationDescriptor); \ - __macro(cudnnGetActivationDescriptor); \ - __macro(cudnnDestroyActivationDescriptor); \ - __macro(cudnnSetRNNDescriptor_v6); -CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - -#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ - __macro(cudnnGetConvolutionForwardAlgorithm); \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ - __macro(cudnnSetRNNDescriptor); -CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#if CUDNN_VERSION >= 7001 -#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ - __macro(cudnnSetConvolutionGroupCount); \ - __macro(cudnnSetConvolutionMathType); \ - __macro(cudnnConvolutionBiasActivationForward); \ - __macro(cudnnCreateCTCLossDescriptor); \ - __macro(cudnnDestroyCTCLossDescriptor); \ - __macro(cudnnGetCTCLossDescriptor); \ - __macro(cudnnSetCTCLossDescriptor); \ - __macro(cudnnGetCTCLossWorkspaceSize); \ - __macro(cudnnCTCLoss); \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ - __macro(cudnnGetConvolutionForwardAlgorithm_v7); \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); -CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#if CUDNN_VERSION >= 7201 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ - __macro(cudnnCreateRNNDataDescriptor); \ - __macro(cudnnDestroyRNNDataDescriptor); \ - __macro(cudnnSetRNNDataDescriptor); \ - __macro(cudnnSetRNNPaddingMode); \ - __macro(cudnnRNNForwardTrainingEx); \ - __macro(cudnnRNNBackwardDataEx); \ - __macro(cudnnRNNBackwardWeightsEx); \ - __macro(cudnnRNNForwardInferenceEx); -CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#if CUDNN_VERSION >= 7401 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ - __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ - __macro(cudnnBatchNormalizationForwardTrainingEx); \ - __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ - __macro(cudnnBatchNormalizationBackwardEx); \ - __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); -CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#if CUDNN_VERSION >= 8000 -#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) \ - __macro(cudnnSetRNNDescriptor_v8); \ - __macro(cudnnCreateFusedOpsPlan); \ - __macro(cudnnCreateFusedOpsConstParamPack); \ - __macro(cudnnCreateFusedOpsVariantParamPack); \ - __macro(cudnnDestroyFusedOpsPlan); \ - __macro(cudnnDestroyFusedOpsConstParamPack); \ - __macro(cudnnDestroyFusedOpsVariantParamPack); \ - __macro(cudnnFusedOpsExecute); \ - __macro(cudnnSetFusedOpsConstParamPackAttribute); \ - __macro(cudnnSetFusedOpsVariantParamPackAttribute); \ - __macro(cudnnMakeFusedOpsPlan); -CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#ifdef PADDLE_WITH_CUDNN_FRONTEND -#define CUDNN_DNN_ROUTINE_EACH_FRONTEND(__macro) \ - __macro(cudnnBackendCreateDescriptor); \ - __macro(cudnnBackendDestroyDescriptor); \ - __macro(cudnnBackendExecute); \ - __macro(cudnnBackendFinalize); \ - __macro(cudnnBackendGetAttribute); \ - __macro(cudnnBackendSetAttribute); \ - __macro(cudnnGetStream); \ 
- __macro(cudnnReorderFilterAndBias); -CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -} // namespace dynload -} // namespace phi - -#endif diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h index b517b719d49..a2c69b6adf0 100644 --- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/phi/kernels/addmm_kernel.h" -#include "../funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" // clang-format on diff --git a/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h index 593c044fc76..1c52ea22e4e 100644 --- a/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h @@ -17,9 +17,9 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h index ef61d48202f..b64f94bc7ef 100644 --- a/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h b/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h index c124e84eb6d..48861d48932 100644 --- a/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/utils/optional.h" diff --git a/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h index 543df3ee964..cd5978ae59f 100644 --- a/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "kernels/funcs/blas/blas.h" -#include "kernels/impl/matmul_grad_kernel_impl.h" #include "paddle/phi/kernels/bmm_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h index 7b4164032b2..ce493b4908a 100644 --- a/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/bmm_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { diff --git 
a/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h index 02332652660..5d146dae8d5 100644 --- a/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/cholesky_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h index 62115e9ee6a..098092767c4 100644 --- a/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -14,7 +14,6 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/cholesky_solve_grad_kernel.h" #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" @@ -22,6 +21,7 @@ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h index 25e0d93a6a4..6066720ab07 100644 --- a/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h @@ -14,10 +14,10 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/vol2col.h" diff --git a/backends/metax_gpu/kernels/impl/conv_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_kernel_impl.h index 2cf5fa166e7..4395e5d5782 100644 --- a/backends/metax_gpu/kernels/impl/conv_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/conv_kernel_impl.h @@ -14,11 +14,11 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/conv_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/vol2col.h" diff --git a/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h index c7c002d4e9e..aadc5d2b8a0 100644 --- a/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h @@ -14,12 +14,12 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/common/ddim.h" #include "paddle/common/layout.h" #include "paddle/phi/kernels/conv_transpose_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include 
"paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/slice.h" diff --git a/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h index d2419966342..b9931a89978 100644 --- a/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -14,11 +14,11 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/common/hostdevice.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/elementwise.h b/backends/metax_gpu/kernels/impl/elementwise.h index 52a7709424b..b9f3d8af1c9 100644 --- a/backends/metax_gpu/kernels/impl/elementwise.h +++ b/backends/metax_gpu/kernels/impl/elementwise.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h b/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h index d4526922c7b..dc4059a7225 100644 --- a/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h @@ -15,10 +15,10 @@ #pragma once #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/flatten_grad_kernel.h" #include "paddle/phi/kernels/flatten_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/flatten2_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h b/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h index 0929a327035..ef12141f911 100644 --- a/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h @@ -16,10 +16,10 @@ #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/utils/optional.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/index_select_impl.h b/backends/metax_gpu/kernels/impl/index_select_impl.h index 78284107d34..ac39cab2704 100644 --- a/backends/metax_gpu/kernels/impl/index_select_impl.h +++ b/backends/metax_gpu/kernels/impl/index_select_impl.h @@ -15,9 +15,9 @@ #pragma once #include "glog/logging.h" -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h index 
85aff008b4e..64b56f2cd1c 100644 --- a/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h @@ -14,10 +14,10 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/inverse_grad_kernel.h" diff --git a/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h b/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h index 079548b4ad0..4a061fe4716 100644 --- a/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" #include "paddle/phi/kernels/funcs/lstm_compute.h" #include "paddle/phi/kernels/funcs/lstm_utils.h" diff --git a/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h index e9ef47490bc..5a2e5d48a11 100644 --- a/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/impl/lu_kernel_impl.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" diff --git a/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h index 21c711c53ef..24dee650dfe 100644 --- a/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h @@ -15,9 +15,9 @@ #pragma once #include "paddle/phi/infermeta/binary.h" -// #include "paddle/phi/kernels/funcs/blas/blas.h" +// #include "paddle/phi/paddle/phi/kernels/funcs/blas/blas.h" -#include "kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_solve.h" #include "paddle/phi/kernels/impl/lu_kernel_impl.h" diff --git a/backends/metax_gpu/kernels/impl/matmul_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_grad_kernel_impl.h deleted file mode 100644 index 823851666f1..00000000000 --- a/backends/metax_gpu/kernels/impl/matmul_grad_kernel_impl.h +++ /dev/null @@ -1,2042 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -// clang-format off -#include "glog/logging.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" -#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h" -// #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" -#include "paddle/phi/kernels/reduce_sum_kernel.h" - -#include "../impl/matmul_kernel_impl.h" -// clang-format on - -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/kernels/gpu/reduce.h" -#endif - -namespace phi { - -template -struct ReduceSumForMatmulGrad { - void operator()(const Context& dev_ctx, - const DenseTensor& input, - DenseTensor* output, - const std::vector& reduce_dims); -}; - -template -struct ReduceSumForMatmulGrad { - void operator()(const CPUContext& dev_ctx, - const DenseTensor& input, - DenseTensor* output, - const std::vector& reduce_dims) { - std::vector reduce_dims_tmp(reduce_dims.begin(), - reduce_dims.end()); - funcs::ReduceKernelImpl( - dev_ctx, input, output, reduce_dims_tmp, true, false); - } -}; - -#if defined(__NVCC__) || defined(__HIPCC__) -template -struct ReduceSumForMatmulGrad { - void operator()(const GPUContext& dev_ctx, - const DenseTensor& input, - DenseTensor* output, - const std::vector& reduce_dims) { - phi::SumKernel( - dev_ctx, input, reduce_dims, input.dtype(), false, output); - } -}; -#endif - -// Reshape a rank-3 tensor from P x M x N to (P * M) x N. -// Identity op if the tensor is not of rank 3. -static DenseTensor FoldInitDims(const DenseTensor& input) { - DenseTensor output = input; - auto in_dims = input.dims(); - if (in_dims.size() == 3) { - output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); - } - return output; -} - -// Reshape a rank-3 tensor from P x M x N to M x (P * N). -// (Warning: This requires transposing data and writes into new memory.) -// Identity op if the tensor is not of rank 3. -template -static DenseTensor FoldHeadAndLastDims(const Context& dev_ctx, - const DenseTensor& input) { - auto in_dims = input.dims(); - if (in_dims.size() != 3) { - return input; - } - DenseTensor output = EmptyLike(dev_ctx, input); - output.Resize({in_dims[1], in_dims[0], in_dims[2]}); - std::vector axis = {1, 0, 2}; - funcs::Transpose trans; - trans(dev_ctx, input, &output, axis); - output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); - return output; -} - -template -typename std::enable_if::value>::type MatMul( - const Context& dev_ctx, - const DenseTensor& a, - bool trans_a, - const DenseTensor& b, - bool trans_b, - DenseTensor* out, - bool flag = false) { - dev_ctx.template Alloc(out); - auto blas = phi::funcs::GetBlas(dev_ctx); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b); - if (a.dims().size() == 3 && b.dims().size() <= 2) { - // the transpose_X must be false, if is true, the transpose cost much time - if (!trans_a) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } - } - blas.MatMul(a.data(), - mat_dim_a, - b.data(), - mat_dim_b, - static_cast(1), - dev_ctx.template Alloc(out), - static_cast(flag)); -} - -/** - * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the - * original x_dim is returned. 
- */ -static DDim RowMatrixFromVector(const DDim& x_dim) { - if (x_dim.size() > 1) { - return x_dim; - } - return common::make_ddim({1, x_dim[0]}); -} - -/** - * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the - * original y_dim is returned. - */ -static DDim ColumnMatrixFromVector(const DDim& y_dim) { - if (y_dim.size() > 1) { - return y_dim; - } - return common::make_ddim({y_dim[0], 1}); -} - -/** - * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. - * - * The shape would be [BatchSize, H, W] or [H, W]. - * If transposed, `H,W` will be swapped. - */ -static void ReshapeTensorIntoMatrixSequence( - DenseTensor* x, const phi::funcs::MatDescriptor& descriptor) { - int64_t h, w; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - if (descriptor.batch_size_) { - x->Resize({descriptor.batch_size_, h, w}); - } else { - x->Resize({h, w}); - } -} - -static void ReshapeXYOutIntoMatrixSequence(DenseTensor* x, - DenseTensor* y, - DenseTensor* out, - bool trans_x, - bool trans_y) { - auto x_dim = RowMatrixFromVector(x->dims()); - auto y_dim = ColumnMatrixFromVector(y->dims()); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); - if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { - out->Resize({mat_dim_x.height_, mat_dim_y.width_}); - } else { - out->Resize({(std::max)(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, - mat_dim_y.width_}); - } - - ReshapeTensorIntoMatrixSequence(x, mat_dim_x); - ReshapeTensorIntoMatrixSequence(y, mat_dim_y); -} - -template -void CalcInputGrad(const Context& dev_ctx, - const DenseTensor& a, - bool trans_a, - bool is_fold_init_dims_a, - const DenseTensor& b, - bool trans_b, - bool is_fold_init_dims_b, - DenseTensor* out, - bool flag = false) { - if (out == nullptr) return; - bool need_combine = - (a.dims().size() == 3 || b.dims().size() == 3) && out->dims().size() == 2; - if (!need_combine) { - MatMul(dev_ctx, a, trans_a, b, trans_b, out, flag); - } else { - MatMul( - dev_ctx, - is_fold_init_dims_a ? FoldInitDims(a) - : FoldHeadAndLastDims(dev_ctx, a), - trans_a, - is_fold_init_dims_b ? FoldInitDims(b) - : FoldHeadAndLastDims(dev_ctx, b), - trans_b, - out, - flag); - } -} - -template -void MatmulGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out_grad, - bool transpose_x, - bool transpose_y, - DenseTensor* dx, - DenseTensor* dy) { - // get dims - std::vector x_dims = common::vectorize(x.dims()); - std::vector y_dims = common::vectorize(y.dims()); - std::vector dout_dims = common::vectorize(out_grad.dims()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - // Case1 : x's or y's dim = 1 - if (x_ndim == 1 && y_ndim == 1) { - if (dx) dev_ctx.template Alloc(dx); - if (dy) dev_ctx.template Alloc(dy); - if (out_grad.numel() == 1) { - DotGradFunction()(dev_ctx, &x, &y, &out_grad, dx, dy); - return; - } - } - - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; - } else { - is_broadcast = !std::equal( - x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, y_dims.cbegin()); - } - - // for complex - DenseTensor x_conj; - DenseTensor y_conj; - - // Case2: no broadcast or no batch size, it aims to speed and it is same as - // matmul in old version. 
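// The four transpose branches below follow the standard matmul gradient
// identities for Out = op(X) * op(Y), writing G = dOut and ' for transpose
// (conjugation is applied first for complex types):
//   X  * Y   : dX = G * Y',   dY = X' * G
//   X' * Y   : dX = Y * G',   dY = X  * G
//   X  * Y'  : dX = G * Y,    dY = G' * X
//   X' * Y'  : dX = Y' * G',  dY = G' * X'
// A dependency-free sketch of the plain (no-transpose) case for reference;
// the names below are illustrative only, the kernel itself dispatches
// through CalcInputGrad and the phi BLAS wrappers:
#include <vector>

// Row-major matmul C[m x n] = op(A) * op(B) with inner dimension k; when a
// transpose flag is set, that buffer is read transposed.
static std::vector<float> NaiveGemm(const std::vector<float>& a, bool ta,
                                    const std::vector<float>& b, bool tb,
                                    int m, int k, int n) {
  std::vector<float> c(m * n, 0.f);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p) {
        const float av = ta ? a[p * m + i] : a[i * k + p];  // op(A)[i][p]
        const float bv = tb ? b[j * k + p] : b[p * n + j];  // op(B)[p][j]
        c[i * n + j] += av * bv;
      }
  return c;
}

// For Out = X[m x k] * Y[k x n] and upstream gradient G[m x n]:
//   dX = G * Y'  ->  NaiveGemm(G, false, Y, true,  m, n, k)
//   dY = X' * G  ->  NaiveGemm(X, true,  G, false, k, m, n)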
- if (!is_broadcast) { - DenseTensor x_help = x; - DenseTensor y_help = y; - DenseTensor out_grad_help = out_grad; - - ReshapeXYOutIntoMatrixSequence( - &x_help, &y_help, &out_grad_help, transpose_x, transpose_y); - - DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x_help.dims()) { - dx->Resize(x_help.dims()); - } - - y_conj = Conj(dev_ctx, y_help); - } - - DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y_help.dims()) { - dy->Resize(y_help.dims()); - } - - x_conj = Conj(dev_ctx, x_help); - } - - if (transpose_x && transpose_y) { - CalcInputGrad( - dev_ctx, y_conj, true, true, out_grad_help, true, false, dx); - CalcInputGrad( - dev_ctx, out_grad_help, true, true, x_conj, true, false, dy); - } else if (transpose_x) { - CalcInputGrad( - dev_ctx, y_conj, false, false, out_grad_help, true, false, dx); - CalcInputGrad( - dev_ctx, x_conj, false, false, out_grad_help, false, true, dy); - } else if (transpose_y) { - CalcInputGrad( - dev_ctx, out_grad_help, false, false, y_conj, false, true, dx); - CalcInputGrad( - dev_ctx, out_grad_help, true, true, x_conj, false, true, dy); - } else { - CalcInputGrad( - dev_ctx, out_grad_help, false, false, y_conj, true, false, dx); - CalcInputGrad( - dev_ctx, x_conj, true, true, out_grad_help, false, true, dy); - } - - if (dx) { - if (dx_dims != x_help.dims()) { - dx->Resize(dx_dims); - } - } - if (dy) { - if (dy_dims != y_help.dims()) { - dy->Resize(dy_dims); - } - } - } else { - // Case3: broadcast. It need cost much time to reduce sum for the - // broadcast and wastes the memory. - // So we should avoid the case in reality. - VLOG(3) << "It need cost much time to reduce sum for the broadcast and " - "wastes the memory. So we should avoid the case in reality"; - x_conj = Conj(dev_ctx, x); - y_conj = Conj(dev_ctx, y); - - DenseTensor dx_help; - DenseTensor dy_help; - - if (transpose_x) { - if (transpose_y) { - // X'Y': dA = Y'G', dB = G'X' - if (dx) - MatMulFunction(dev_ctx, - y_conj, - out_grad, - y_dims, - dout_dims, - &dx_help, - true, - true); - if (dy) - MatMulFunction(dev_ctx, - out_grad, - x_conj, - dout_dims, - x_dims, - &dy_help, - true, - true); - } else { - // X'Y: dX = YG', dY = XG - if (dx) - MatMulFunction(dev_ctx, - y_conj, - out_grad, - y_dims, - dout_dims, - &dx_help, - false, - true); - if (dy) - MatMulFunction(dev_ctx, - x_conj, - out_grad, - x_dims, - dout_dims, - &dy_help, - false, - false); - } - } else { - if (transpose_y) { - // XY': dX = GY, dY = G'X - if (dx) - MatMulFunction(dev_ctx, - out_grad, - y_conj, - dout_dims, - y_dims, - &dx_help, - false, - false); - if (dy) - MatMulFunction(dev_ctx, - out_grad, - x_conj, - dout_dims, - x_dims, - &dy_help, - true, - false); - } else { - // XY: dX = GY', dY = X'G - if (dx) - MatMulFunction(dev_ctx, - out_grad, - y_conj, - dout_dims, - y_dims, - &dx_help, - false, - true); - if (dy) - MatMulFunction(dev_ctx, - x_conj, - out_grad, - x_dims, - dout_dims, - &dy_help, - true, - false); - } - } - - // get help dims - const std::vector dx_help_dims = - common::vectorize(dx_help.dims()); - const std::vector dy_help_dims = - common::vectorize(dy_help.dims()); - - std::vector dx_broadcast_dims(ndim); - std::vector dy_broadcast_dims(ndim); - - std::fill( - dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); - std::fill( - dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); - std::copy(x_dims.data(), - x_dims.data() + x_ndim, - dx_broadcast_dims.data() + ndim - x_ndim); - std::copy(y_dims.data(), - y_dims.data() + y_ndim, - 
dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dx_reduce_dims; - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - // reduce sum to get grad by ReduceSum - if (dx) { - if (dx_reduce_dims.empty()) { - *dx = std::move(dx_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, dx_help, dx, dx_reduce_dims); - } - dx->Resize(x.dims()); - } - if (dy) { - if (dy_reduce_dims.empty()) { - *dy = std::move(dy_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, dy_help, dy, dy_reduce_dims); - } - dy->Resize(y.dims()); - } - // Get the OutputGrad(out) - } -} - -template -void MatmulDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - const paddle::optional& ddx, - const paddle::optional& ddy, - bool transpose_x, - bool transpose_y, - DenseTensor* dx, - DenseTensor* dy, - DenseTensor* ddout) { - // Get dims from the input x, y, output_grad - std::vector x_dims = common::vectorize(x.dims()); - std::vector y_dims = common::vectorize(y.dims()); - std::vector dout_dims = common::vectorize(dout.dims()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - // Case1 : x's or y's dim = 1 - if (x_ndim == 1 && y_ndim == 1) { - DotDoubleGradFunction()( - dev_ctx, &x, &y, &dout, &ddx, &ddy, dx, dy, ddout); - return; - } - - DenseTensor x_conj; - DenseTensor y_conj; - DenseTensor dout_conj; - - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; - } else { - is_broadcast = !std::equal( - x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, y_dims.cbegin()); - } - - if (!is_broadcast) { - // Case2: no broadcast or no batch size - DenseTensor x_help = x; - DenseTensor y_help = y; - DenseTensor dout_help = dout; - ReshapeXYOutIntoMatrixSequence( - &x_help, &y_help, &dout_help, transpose_x, transpose_y); - DDim dx_dims; - - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x_help.dims()) { - dx->Resize(x_help.dims()); - } - } - - DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y_help.dims()) { - dy->Resize(y_help.dims()); - } - } - - DDim ddout_dims; - if (ddout) { - ddout_dims = ddout->dims(); - if (ddout_dims != dout_help.dims()) { - ddout->Resize(dout_help.dims()); - } - - x_conj = Conj(dev_ctx, x_help); - y_conj = Conj(dev_ctx, y_help); - } - - if (dx || dy) { - dout_conj = Conj(dev_ctx, dout_help); - } - - bool ddout_flag = false; - if (ddx) { - auto ddx_mat = ddx.get(); - if (ddx_mat.dims() != x_help.dims()) { - ddx_mat.Resize(x_help.dims()); - } - if (dy) { - if (transpose_x && transpose_y) { - // dy = dout' * ddx' - CalcInputGrad( - dev_ctx, dout_conj, true, true, ddx_mat, true, false, dy, false); - } else if (transpose_x) { - // dy = ddx * dout - CalcInputGrad(dev_ctx, - ddx_mat, - false, - false, - dout_conj, - false, - true, - dy, - false); - } else if (transpose_y) { - // dy = dout' * ddx - CalcInputGrad( - dev_ctx, dout_conj, true, true, ddx_mat, false, true, dy, false); - } else { - // dy = ddx' * dout - CalcInputGrad( - dev_ctx, ddx_mat, true, true, dout_conj, false, true, dy, false); - } - } - - if (ddout) { - CalcInputGrad(dev_ctx, - ddx_mat, - transpose_x, - true, - y_conj, - transpose_y, - false, - ddout, - ddout_flag); - ddout_flag = true; - } - } 
else if (!ddx && dy) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); - } - if (ddy) { - auto ddy_mat = ddy.get(); - if (ddy_mat.dims() != y_help.dims()) { - ddy_mat.Resize(y_help.dims()); - } - if (dx) { - if (transpose_x && transpose_y) { - // dx = ddy' * dout' - CalcInputGrad( - dev_ctx, ddy_mat, true, true, dout_conj, true, false, dx, false); - } else if (transpose_x) { - // dx = ddy * dout' - CalcInputGrad(dev_ctx, - ddy_mat, - false, - false, - dout_conj, - true, - false, - dx, - false); - } else if (transpose_y) { - // dx = dout * ddy - CalcInputGrad(dev_ctx, - dout_conj, - false, - false, - ddy_mat, - false, - true, - dx, - false); - } else { - // dx = dout * ddy' - CalcInputGrad(dev_ctx, - dout_conj, - false, - false, - ddy_mat, - true, - false, - dx, - false); - } - } - - if (ddout) { - CalcInputGrad(dev_ctx, - x_conj, - transpose_x, - true, - ddy_mat, - transpose_y, - false, - ddout, - ddout_flag); - } - } else if (!ddy && dx) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), dx); - } - if (ddout && !ddx && !ddy) { - FullLikeKernel( - dev_ctx, dout, Scalar(0.0), dout.dtype(), ddout); - } - - if (dx) { - if (dx_dims != x_help.dims()) { - dx->Resize(dx_dims); - } - } - - if (dy) { - if (dy_dims != y_help.dims()) { - dy->Resize(dy_dims); - } - } - - if (ddout) { - if (ddout_dims != dout_help.dims()) { - ddout->Resize(ddout_dims); - } - } - } else { - // Case3: broadcast. It need cost much time to reduce sum for the - // broadcast and wastes the memory. - // So we should avoid the case in reality. - VLOG(3) << "It need cost much time to reduce sum for the broadcast and " - "wastes the memory. So we should avoid the case in reality"; - if (dx || dy) { - dout_conj = Conj(dev_ctx, dout); - } - if (ddout) { - x_conj = Conj(dev_ctx, x); - y_conj = Conj(dev_ctx, y); - } - - DenseTensor dx_help; - DenseTensor dy_help; - - if (transpose_x) { - if (transpose_y) { - if (dx && ddy) { - MatMulFunction(dev_ctx, - ddy.get(), - dout_conj, - y_dims, - dout_dims, - &dx_help, - true, - true); - } - if (dy && ddx) { - MatMulFunction(dev_ctx, - dout_conj, - ddx.get(), - dout_dims, - x_dims, - &dy_help, - true, - true); - } - } else { - if (dx && ddy) { - MatMulFunction(dev_ctx, - ddy.get(), - dout_conj, - y_dims, - dout_dims, - &dx_help, - false, - true); - } - if (dy && ddx) { - MatMulFunction(dev_ctx, - ddx.get(), - dout_conj, - x_dims, - dout_dims, - &dy_help, - false, - false); - } - } - } else { - if (transpose_y) { - if (dx && ddy) { - MatMulFunction(dev_ctx, - dout_conj, - ddy.get(), - dout_dims, - y_dims, - &dx_help, - false, - false); - } - if (dy && ddx) { - MatMulFunction(dev_ctx, - dout_conj, - ddx.get(), - dout_dims, - x_dims, - &dy_help, - true, - false); - } - } else { - if (dx && ddy) { - MatMulFunction(dev_ctx, - dout_conj, - ddy.get(), - dout_dims, - y_dims, - &dx_help, - false, - true); - } - if (dy && ddx) { - MatMulFunction(dev_ctx, - ddx.get(), - dout_conj, - x_dims, - dout_dims, - &dy_help, - true, - false); - } - } - } - - // get help dims - const std::vector dx_help_dims = - common::vectorize(dx_help.dims()); - const std::vector dy_help_dims = - common::vectorize(dy_help.dims()); - - std::vector dx_broadcast_dims(ndim); - std::vector dy_broadcast_dims(ndim); - - std::fill( - dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); - std::fill( - dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); - std::copy(x_dims.data(), - x_dims.data() + x_ndim, - dx_broadcast_dims.data() + ndim - x_ndim); - std::copy(y_dims.data(), - 
y_dims.data() + y_ndim, - dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dx_reduce_dims; - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - // Reduce sum to get grad by ReduceSum - if (dx && dx_help.initialized()) { - if (dx_reduce_dims.empty()) { - *dx = std::move(dx_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, dx_help, dx, dx_reduce_dims); - } - dx->Resize(x.dims()); - } else if (dx && !dx_help.initialized()) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), dx); - } - if (dy && dy_help.initialized()) { - if (dy_reduce_dims.empty()) { - *dy = std::move(dy_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, dy_help, dy, dy_reduce_dims); - } - dy->Resize(y.dims()); - } else if (dy && !dy_help.initialized()) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); - } - - if (ddout) { - // Calculate the gradient of OutputGrad(Out) - if (ddx) { - MatMulFunction(dev_ctx, - ddx.get(), - y_conj, - x_dims, - y_dims, - ddout, - transpose_x, - transpose_y); - } - - if (ddy) { - MatMulFunction(dev_ctx, - x_conj, - ddy.get(), - x_dims, - y_dims, - ddout, - transpose_x, - transpose_y, - true); - } - } - } -} - -template -void MatmulTripleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - const paddle::optional& ddx, - const paddle::optional& ddy, - const paddle::optional& d_dx, - const paddle::optional& d_dy, - const paddle::optional& d_ddout, - bool transpose_x, - bool transpose_y, - DenseTensor* out_d_x, - DenseTensor* out_d_y, - DenseTensor* out_d_dout, - DenseTensor* out_d_ddx, - DenseTensor* out_d_ddy) { - // Get dims from the input x, y, output_grad - std::vector x_dims = common::vectorize(x.dims()); - std::vector y_dims = common::vectorize(y.dims()); - std::vector dout_dims = common::vectorize(dout.dims()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - // Case1 : x's and y's dim = 1 - if (x_ndim == 1 && y_ndim == 1) { - VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 1"; - DotTripleGradFunction()(dev_ctx, - &x, - &y, - &dout, - &ddx, - &ddy, - &d_dx, - &d_dy, - &d_ddout, - out_d_x, - out_d_y, - out_d_dout, - out_d_ddx, - out_d_ddy); - return; - } - - DenseTensor x_conj; - DenseTensor y_conj; - DenseTensor dout_conj; - DenseTensor ddx_conj; - DenseTensor ddy_conj; - - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; - } else { - is_broadcast = !std::equal( - x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, y_dims.cbegin()); - } - - if (!is_broadcast) { - // Case2: no broadcast or no batch size - VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 2"; - DenseTensor x_help = x; - DenseTensor y_help = y; - DenseTensor dout_help = dout; - - DenseTensor ddx_help; - DenseTensor ddy_help; - ReshapeXYOutIntoMatrixSequence( - &x_help, &y_help, &dout_help, transpose_x, transpose_y); - if (ddx) { - ddx_help = ddx.get(); - if (ddx_help.dims() != x_help.dims()) { - ddx_help.Resize(x_help.dims()); - } - } - - if (ddy) { - ddy_help = ddy.get(); - if (ddy_help.dims() != y_help.dims()) { - ddy_help.Resize(y_help.dims()); - } - } - - DDim out_dx_dims; - if (out_d_x) { - out_dx_dims = out_d_x->dims(); - if 
(out_dx_dims != x_help.dims()) { - out_d_x->Resize(x_help.dims()); - } - if (ddy) { - ddy_conj = Conj(dev_ctx, ddy_help); - } - } - DDim out_dy_dims; - if (out_d_y) { - out_dy_dims = out_d_y->dims(); - if (out_dy_dims != y_help.dims()) { - out_d_y->Resize(y_help.dims()); - } - if (ddx) { - ddx_conj = Conj(dev_ctx, ddx_help); - } - } - DDim out_d_dout_dims; - if (out_d_dout) { - out_d_dout_dims = out_d_dout->dims(); - if (out_d_dout_dims != dout_help.dims()) { - out_d_dout->Resize(dout_help.dims()); - } - if (ddx && !ddx_conj.IsInitialized()) { - ddx_conj = Conj(dev_ctx, ddx_help); - } - if (ddy && !ddy_conj.IsInitialized()) { - ddy_conj = Conj(dev_ctx, ddy_help); - } - } - DDim out_d_ddx_dims; - if (out_d_ddx) { - out_d_ddx_dims = out_d_ddx->dims(); - if (out_d_ddx_dims != x_help.dims()) { - out_d_ddx->Resize(x_help.dims()); - } - dout_conj = Conj(dev_ctx, dout_help); - y_conj = Conj(dev_ctx, y_help); - } - DDim out_d_ddy_dims; - if (out_d_ddy) { - out_d_ddy_dims = out_d_ddy->dims(); - if (out_d_ddy_dims != y_help.dims()) { - out_d_ddy->Resize(y_help.dims()); - } - if (!dout_conj.IsInitialized()) { - dout_conj = Conj(dev_ctx, dout_help); - } - x_conj = Conj(dev_ctx, x_help); - } - - bool d_dout_flag = false; - bool d_ddx_flag = false; - bool d_ddy_flag = false; - if (d_ddout) { - auto d_ddout_mat = d_ddout.get(); - if (d_ddout_mat.dims() != dout_help.dims()) { - d_ddout_mat.Resize(dout_help.dims()); - } - - if (out_d_y && ddx) { - if (transpose_x && transpose_y) { - // out_d_y = d_ddout' * ddx' - CalcInputGrad(dev_ctx, - d_ddout_mat, - true, - true, - ddx_conj, - true, - false, - out_d_y, - false); - } else if (transpose_x) { - // out_d_y = ddx * d_ddout - CalcInputGrad(dev_ctx, - ddx_conj, - false, - false, - d_ddout_mat, - false, - true, - out_d_y, - false); - } else if (transpose_y) { - // out_d_y = d_ddout' * ddx - CalcInputGrad(dev_ctx, - d_ddout_mat, - true, - true, - ddx_conj, - false, - true, - out_d_y, - false); - } else { - // out_d_y = ddx' * d_ddout - CalcInputGrad(dev_ctx, - ddx_conj, - true, - true, - d_ddout_mat, - false, - true, - out_d_y, - false); - } - } else if (out_d_y) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y); - } - if (out_d_x && ddy) { - if (transpose_x && transpose_y) { - // out_d_x = ddy' * d_ddout' - CalcInputGrad(dev_ctx, - ddy_conj, - true, - true, - d_ddout_mat, - true, - false, - out_d_x, - false); - } else if (transpose_x) { - // out_d_x = ddy * d_ddout' - CalcInputGrad(dev_ctx, - ddy_conj, - false, - false, - d_ddout_mat, - true, - false, - out_d_x, - false); - } else if (transpose_y) { - // out_d_x = d_ddout * ddy - CalcInputGrad(dev_ctx, - d_ddout_mat, - false, - false, - ddy_conj, - false, - true, - out_d_x, - false); - } else { - // out_d_x = d_ddout * ddy' - CalcInputGrad(dev_ctx, - d_ddout_mat, - false, - false, - ddy_conj, - true, - false, - out_d_x, - false); - } - } else if (out_d_x) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x); - } - - // equations: - // d_ddx = DOut * D_DY + Y * D_DDOut - // Let: d_ddx1 = Y * D_DDOut - // Let: d_ddx2 = DOut * D_DY - - // d_ddy = DOut * D_DX + X * D_DDOut - // Let: d_ddy1 = X * D_DDOut - // Let: d_ddy2 = DOut * D_DX - - // d_dout = DDY * D_DX + DDX * D_DY - // Let: d_dout1 = DDX * D_DY - // Let: d_dout2 = DDY * D_DX - - // compute d_ddx1 - if (out_d_ddx) { - if (transpose_x && transpose_y) { - // out_d_ddx1 = y' * d_ddout' - CalcInputGrad(dev_ctx, - y_conj, - true, - true, - d_ddout_mat, - true, - false, - out_d_ddx, - d_ddx_flag); - } else if (transpose_x) { - // 
out_d_ddx1 = y * d_ddout' - CalcInputGrad(dev_ctx, - y_conj, - false, - false, - d_ddout_mat, - true, - false, - out_d_ddx, - d_ddx_flag); - } else if (transpose_y) { - // out_d_ddx1 = d_ddout * y - CalcInputGrad(dev_ctx, - d_ddout_mat, - false, - false, - y_conj, - false, - true, - out_d_ddx, - d_ddx_flag); - } else { - // out_d_ddx1 = d_ddout * y' - CalcInputGrad(dev_ctx, - d_ddout_mat, - false, - false, - y_conj, - true, - false, - out_d_ddx, - d_ddx_flag); - } - d_ddx_flag = true; - } - - // compute d_ddy1 - if (out_d_ddy) { - if (transpose_x && transpose_y) { - // out_d_ddy1 = d_ddout' * x' - CalcInputGrad(dev_ctx, - d_ddout_mat, - true, - true, - x_conj, - true, - false, - out_d_ddy, - false); - } else if (transpose_x) { - // out_d_ddy1 = x * d_ddout - CalcInputGrad(dev_ctx, - x_conj, - false, - false, - d_ddout_mat, - false, - true, - out_d_ddy, - false); - } else if (transpose_y) { - // out_d_ddy1 = d_ddout' * x - CalcInputGrad(dev_ctx, - d_ddout_mat, - true, - true, - x_conj, - false, - true, - out_d_ddy, - false); - } else { - // out_d_ddy1 = x' * d_ddout - CalcInputGrad(dev_ctx, - x_conj, - true, - true, - d_ddout_mat, - false, - true, - out_d_ddy, - false); - } - d_ddy_flag = true; - } - } else { - // d_ddout is none - if (out_d_x) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x); - } - - if (out_d_y) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y); - } - } - - if (d_dy) { - auto d_dy_mat = d_dy.get(); - if (d_dy_mat.dims() != y_help.dims()) { - d_dy_mat.Resize(y_help.dims()); - } - - // compute d_dout1 - if (out_d_dout && ddx) { - CalcInputGrad(dev_ctx, - ddx_conj, - transpose_x, - true, - d_dy_mat, - transpose_y, - false, - out_d_dout, - d_dout_flag); - d_dout_flag = true; - } - - // compute d_ddx2 - if (out_d_ddx) { - if (transpose_x && transpose_y) { - // out_d_ddx2 = D_DY' * DOut' - CalcInputGrad(dev_ctx, - d_dy_mat, - true, - true, - dout_conj, - true, - false, - out_d_ddx, - d_ddx_flag); - } else if (transpose_x) { - // out_d_ddx2 = D_DY * Dout' - CalcInputGrad(dev_ctx, - d_dy_mat, - false, - false, - dout_conj, - true, - false, - out_d_ddx, - d_ddx_flag); - } else if (transpose_y) { - // out_d_ddx2 = Dout * D_DY - CalcInputGrad(dev_ctx, - dout_conj, - false, - false, - d_dy_mat, - false, - true, - out_d_ddx, - d_ddx_flag); - } else { - // out_d_ddx2 = Dout * D_DY' - CalcInputGrad(dev_ctx, - dout_conj, - false, - false, - d_dy_mat, - true, - false, - out_d_ddx, - d_ddx_flag); - } - } - } - - if (d_dx) { - auto d_dx_mat = d_dx.get(); - if (d_dx_mat.dims() != x_help.dims()) { - d_dx_mat.Resize(x_help.dims()); - } - - // compute d_dout2 - if (out_d_dout && ddy) { - CalcInputGrad(dev_ctx, - d_dx_mat, - transpose_x, - true, - ddy_conj, - transpose_y, - false, - out_d_dout, - d_dout_flag); - } - - // compute d_ddy2 - if (out_d_ddy) { - if (transpose_x && transpose_y) { - // out_d_ddy2 = dout' * d_dx' - CalcInputGrad(dev_ctx, - dout_conj, - true, - true, - d_dx_mat, - true, - false, - out_d_ddy, - d_ddy_flag); - } else if (transpose_x) { - // out_d_ddy2 = d_dx * dout - CalcInputGrad(dev_ctx, - d_dx_mat, - false, - false, - dout_conj, - false, - true, - out_d_ddy, - d_ddy_flag); - } else if (transpose_y) { - // out_d_ddy2 = dout' * d_dx - CalcInputGrad(dev_ctx, - dout_conj, - true, - true, - d_dx_mat, - false, - true, - out_d_ddy, - d_ddy_flag); - } else { - // out_d_ddy2 = d_dx' * dout - CalcInputGrad(dev_ctx, - d_dx_mat, - true, - true, - dout_conj, - false, - true, - out_d_ddy, - d_ddy_flag); - } - } - } - - if (out_d_x) { - if (out_dx_dims != 
x_help.dims()) { - out_d_x->Resize(out_dx_dims); - } - } - - if (out_d_y) { - if (out_dy_dims != y_help.dims()) { - out_d_y->Resize(out_dy_dims); - } - } - - if (out_d_dout) { - if (out_d_dout_dims != dout_help.dims()) { - out_d_dout->Resize(out_d_dout_dims); - } - } - - if (out_d_ddx) { - if (out_d_ddx_dims != x_help.dims()) { - out_d_ddx->Resize(out_d_ddx_dims); - } - } - - if (out_d_ddy) { - if (out_d_ddy_dims != y_help.dims()) { - out_d_ddy->Resize(out_d_ddy_dims); - } - } - - if (out_d_dout && !out_d_dout->IsInitialized()) { - FullLikeKernel( - dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout); - } - - if (out_d_ddx && !out_d_ddx->IsInitialized()) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx); - } - - if (out_d_ddy && !out_d_ddy->IsInitialized()) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy); - } - } else { - // Case3: broadcast. It need cost much time to reduce sum for the - // broadcast and wastes the memory. - // So we should avoid the case in reality. - VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 3"; - VLOG(3) << "It need cost much time to reduce sum for the broadcast and " - "wastes the memory. So we should avoid the case in reality"; - - DenseTensor out_dx_help; - DenseTensor out_dy_help; - DenseTensor out_d_ddx_help; - DenseTensor out_d_ddy_help; - - if (out_d_dout) { - if (ddx) { - ddx_conj = Conj(dev_ctx, ddx.get()); - } - if (ddy) { - ddy_conj = Conj(dev_ctx, ddy.get()); - } - } - if (out_d_ddx || out_d_ddy) { - x_conj = Conj(dev_ctx, x); - y_conj = Conj(dev_ctx, y); - dout_conj = Conj(dev_ctx, dout); - } - - if (transpose_x) { - if (transpose_y) { - // dX = ddY' d_ddout’, dY = d_ddout’ ddX' - if (out_d_x && ddy && d_ddout) - MatMulFunction(dev_ctx, - ddy_conj, - d_ddout.get(), - y_dims, - dout_dims, - &out_dx_help, - true, - true); - if (out_d_y && ddx && d_ddout) - MatMulFunction(dev_ctx, - d_ddout.get(), - ddx_conj, - dout_dims, - x_dims, - &out_dy_help, - true, - true); - } else { - // dX = ddY d_ddout', dY = ddX d_ddout - if (out_d_x && ddy && d_ddout) - MatMulFunction(dev_ctx, - ddy_conj, - d_ddout.get(), - y_dims, - dout_dims, - &out_dx_help, - false, - true); - if (out_d_y && ddx && d_ddout) - MatMulFunction(dev_ctx, - ddx_conj, - d_ddout.get(), - x_dims, - dout_dims, - &out_dy_help, - false, - false); - } - - } else { - if (transpose_y) { - // dX = d_ddout ddY, dY = d_ddout’ ddX - if (out_d_x && ddy && d_ddout) - MatMulFunction(dev_ctx, - d_ddout.get(), - ddy_conj, - dout_dims, - y_dims, - &out_dx_help, - false, - false); - if (out_d_y && ddx && d_ddout) - MatMulFunction(dev_ctx, - d_ddout.get(), - ddx_conj, - dout_dims, - x_dims, - &out_dy_help, - true, - false); - } else { - // dX = d_ddout ddY', dY = ddX' d_ddout - if (out_d_x && ddy && d_ddout) - MatMulFunction(dev_ctx, - d_ddout.get(), - ddy_conj, - dout_dims, - y_dims, - &out_dx_help, - false, - true); - if (out_d_y && ddx && d_ddout) - MatMulFunction(dev_ctx, - ddx_conj, - d_ddout.get(), - x_dims, - dout_dims, - &out_dy_help, - true, - false); - } - } - - // get help dims - const std::vector dx_help_dims = - common::vectorize(out_dx_help.dims()); - const std::vector dy_help_dims = - common::vectorize(out_dx_help.dims()); - - std::vector dx_broadcast_dims(ndim); - std::vector dy_broadcast_dims(ndim); - - std::fill( - dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); - std::fill( - dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); - std::copy(x_dims.data(), - x_dims.data() + x_ndim, - 
dx_broadcast_dims.data() + ndim - x_ndim); - std::copy(y_dims.data(), - y_dims.data() + y_ndim, - dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dx_reduce_dims; - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - - // Reduce sum to get grad by ReduceSum - if (out_d_x && out_dx_help.initialized()) { - if (dx_reduce_dims.empty()) { - *out_d_x = std::move(out_dx_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_dx_help, out_d_x, dx_reduce_dims); - } - out_d_x->Resize(x.dims()); - } else if (out_d_x) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x); - } - - if (out_d_y && out_dy_help.initialized()) { - if (dy_reduce_dims.empty()) { - *out_d_y = std::move(out_dy_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_dy_help, out_d_y, dy_reduce_dims); - } - out_d_y->Resize(y.dims()); - } else if (out_d_y) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y); - } - - // compute d_dout - if (out_d_dout) { - if (d_dx && ddy) { - MatMulFunction(dev_ctx, - d_dx.get(), - ddy_conj, - x_dims, - y_dims, - out_d_dout, - transpose_x, - transpose_y); - } - if (d_dy && ddx) { - MatMulFunction(dev_ctx, - ddx_conj, - d_dy.get(), - x_dims, - y_dims, - out_d_dout, - transpose_x, - transpose_y, - true); - } - - if (!out_d_dout->initialized()) { - FullLikeKernel( - dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout); - } - } - - // compute d_ddx - if (out_d_ddx) { - if (transpose_x && transpose_y) { - // out_d_ddx1 = y' * d_ddout' - if (d_ddout) { - MatMulFunction(dev_ctx, - y_conj, - d_ddout.get(), - y_dims, - dout_dims, - &out_d_ddx_help, - true, - true); - } - - // out_d_ddx2 = D_DY' * DOut' - if (d_dy) { - MatMulFunction(dev_ctx, - d_dy.get(), - dout_conj, - y_dims, - dout_dims, - &out_d_ddx_help, - true, - true, - true); - } - - } else if (transpose_x) { - // out_d_ddx1 = y * d_ddout' - if (d_ddout) { - MatMulFunction(dev_ctx, - y_conj, - d_ddout.get(), - y_dims, - dout_dims, - &out_d_ddx_help, - false, - true); - } - - // out_d_ddx2 = D_DY * Dout' - if (d_dy) { - MatMulFunction(dev_ctx, - d_dy.get(), - dout_conj, - y_dims, - dout_dims, - &out_d_ddx_help, - false, - true, - true); - } - - } else if (transpose_y) { - // out_d_ddx1 = d_ddout * y - if (d_ddout) { - MatMulFunction(dev_ctx, - d_ddout.get(), - y_conj, - dout_dims, - y_dims, - &out_d_ddx_help, - false, - false); - } - - // out_d_ddx2 = Dout * D_DY - if (d_dy) { - MatMulFunction(dev_ctx, - dout_conj, - d_dy.get(), - dout_dims, - y_dims, - &out_d_ddx_help, - false, - false, - true); - } - } else { - // out_d_ddx1 = d_ddout * y' - if (d_ddout) { - MatMulFunction(dev_ctx, - d_ddout.get(), - y_conj, - dout_dims, - y_dims, - &out_d_ddx_help, - false, - true); - } - - // out_d_ddx2 = Dout * D_DY' - if (d_dy) { - MatMulFunction(dev_ctx, - dout_conj, - d_dy.get(), - dout_dims, - y_dims, - &out_d_ddx_help, - false, - true, - true); - } - } - if (out_d_ddx_help.initialized()) { - if (dx_reduce_dims.empty()) { - *out_d_ddx = std::move(out_d_ddx_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_d_ddx_help, out_d_ddx, dx_reduce_dims); - } - } else { - FullLikeKernel( - dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx); - } - - out_d_ddx->Resize(x.dims()); - } - - // compute d_ddy - if (out_d_ddy) { - if (transpose_x && transpose_y) { - // out_d_ddy1 = d_ddout' * x' - if 
(d_ddout) { - MatMulFunction(dev_ctx, - d_ddout.get(), - x_conj, - dout_dims, - x_dims, - &out_d_ddy_help, - true, - true); - } - - // out_d_ddy2 = dout' * d_dx' - if (d_dx) { - MatMulFunction(dev_ctx, - dout_conj, - d_dx.get(), - dout_dims, - x_dims, - &out_d_ddy_help, - true, - true, - true); - } - - } else if (transpose_x) { - // out_d_ddy1 = x * d_ddout - if (d_ddout) { - MatMulFunction(dev_ctx, - x_conj, - d_ddout.get(), - x_dims, - dout_dims, - &out_d_ddy_help, - false, - false); - } - - // out_d_ddy2 = d_dx * dout - if (d_dx) { - MatMulFunction(dev_ctx, - d_dx.get(), - dout_conj, - x_dims, - dout_dims, - &out_d_ddy_help, - false, - false, - true); - } - - } else if (transpose_y) { - // out_d_ddy1 = d_ddout' * x - if (d_ddout) { - MatMulFunction(dev_ctx, - d_ddout.get(), - x_conj, - dout_dims, - x_dims, - &out_d_ddy_help, - true, - false); - } - - // out_d_ddy2 = dout' * d_dx - if (d_dx) { - MatMulFunction(dev_ctx, - dout_conj, - d_dx.get(), - dout_dims, - x_dims, - &out_d_ddy_help, - true, - false, - true); - } - - } else { - // out_d_ddy1 = x' * d_ddout - if (d_ddout) { - MatMulFunction(dev_ctx, - x_conj, - d_ddout.get(), - x_dims, - dout_dims, - &out_d_ddy_help, - true, - false); - } - - // out_d_ddy2 = d_dx' * dout - if (d_dx) { - MatMulFunction(dev_ctx, - d_dx.get(), - dout_conj, - x_dims, - dout_dims, - &out_d_ddy_help, - true, - false, - true); - } - } - - if (out_d_ddy_help.initialized()) { - if (dy_reduce_dims.empty()) { - *out_d_ddy = std::move(out_d_ddy_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_d_ddy_help, out_d_ddy, dy_reduce_dims); - } - } else { - FullLikeKernel( - dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy); - } - - out_d_ddy->Resize(y.dims()); - } - } -} - -template -void MatmulWithFlattenGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out_grad, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* x_grad, - DenseTensor* y_grad) { - auto x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - auto y_matrix = - y.dims().size() > 2 ? phi::ReshapeToMatrix(y, y_num_col_dims) : y; - auto* dout = &out_grad; - - DenseTensor dout_mat(*dout); - dout_mat.Resize({common::flatten_to_2d(x.dims(), x_num_col_dims)[0], - common::flatten_to_2d(y.dims(), y_num_col_dims)[1]}); - - auto* dx = x_grad; - auto* dy = y_grad; - - if (dx != nullptr) { - dx->set_lod(x.lod()); - } - if (dy != nullptr) { - dy->set_lod(y.lod()); - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - if (dx) { - dev_ctx.template Alloc(dx); - DenseTensor dx_matrix = - dx->dims().size() > 2 ? phi::ReshapeToMatrix(*dx, x_num_col_dims) : *dx; - - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - } - if (dy) { - dev_ctx.template Alloc(dy); - DenseTensor dy_matrix = - dy->dims().size() > 2 ? phi::ReshapeToMatrix(*dy, y_num_col_dims) : *dy; - // dy = x' * dout. dy K x N, dout : M x N, x : M x K - blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - } -} - -template -void MatmulWithFlattenDoubleGradKernel( - const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out_grad, - const paddle::optional& x_grad_grad, - const paddle::optional& y_grad_grad, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* x_grad, - DenseTensor* y_grad, - DenseTensor* out_grad_grad) { - auto x_mat = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - auto y_mat = - y.dims().size() > 2 ? 
phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - const int m = common::flatten_to_2d(x.dims(), x_num_col_dims)[0]; - const int n = common::flatten_to_2d(y.dims(), y_num_col_dims)[1]; - - auto* dout = &out_grad; - DenseTensor dout_mat(*dout); - dout_mat.Resize({m, n}); - - auto* ddx = x_grad_grad.get_ptr(); - auto* ddy = y_grad_grad.get_ptr(); - - auto* dx = x_grad; - auto* dy = y_grad; - auto* ddout = out_grad_grad; - - DenseTensor ddout_mat; - if (ddout) { - ddout->set_lod(dout->lod()); - // allocate and reshape ddout - dev_ctx.template Alloc(ddout); - ddout_mat.ShareDataWith(*ddout); - ddout_mat.Resize({m, n}); - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - // a flag to specify whether ddout value has been set, if flag - // is false, MatMul beta should be 0 to set ddout, if flag is - // true, MatMul beta should be 1 to add result to ddout. - bool ddout_flag = false; - if (ddx) { - auto ddx_mat = ddx->dims().size() > 2 - ? phi::ReshapeToMatrix(*ddx, x_num_col_dims) - : static_cast(*ddx); - - // dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N - if (dy) { - dy->set_lod(y.lod()); - // allocate and reshape dy - dev_ctx.template Alloc(dy); - DenseTensor dy_mat = dy->dims().size() > 2 - ? phi::ReshapeToMatrix(*dy, y_num_col_dims) - : *dy; - blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat); - } - // ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N - if (ddout) { - blas.MatMul(ddx_mat, - false, - y_mat, - false, - static_cast(1.0), - &ddout_mat, - static_cast(ddout_flag)); - ddout_flag = true; - } - } - if (ddy) { - auto ddy_mat = ddy->dims().size() > 2 - ? phi::ReshapeToMatrix(*ddy, y_num_col_dims) - : static_cast(*ddy); - // dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K - if (dx) { - dx->set_lod(x.lod()); - // allocate and reshape dx - dev_ctx.template Alloc(dx); - DenseTensor dx_mat = dx->dims().size() > 2 - ? phi::ReshapeToMatrix(*dx, x_num_col_dims) - : *dx; - blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat); - } - // ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N - if (ddout) { - blas.MatMul(x_mat, - false, - ddy_mat, - false, - static_cast(1.0), - &ddout_mat, - static_cast(ddout_flag)); - } - } -} -template -void LegacyMatmulGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out_grad, - bool transpose_x, - bool transpose_y, - float alpha, - DenseTensor* dx, - DenseTensor* dy) { - MatmulGradKernel( - dev_ctx, x, y, out_grad, transpose_x, transpose_y, dx, dy); - if (std::fabs(alpha - 1.f) > 1e-6f) { - ScaleKernel(dev_ctx, *dx, Scalar(alpha), Scalar(0), false, dx); - ScaleKernel(dev_ctx, *dy, Scalar(alpha), Scalar(0), false, dy); - } -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h deleted file mode 100755 index 5221bd93ba9..00000000000 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h +++ /dev/null @@ -1,1717 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -// clang-format off -#include "glog/logging.h" - -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/autotune/cache_base.h" -#include "paddle/phi/kernels/cast_kernel.h" -#include "../funcs/blas/blas.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.hip.h" -#else -#include "../funcs/blas/blaslt_impl.cu.h" -#endif -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/scale_kernel.h" -#if defined(PADDLE_WITH_CUDA) -// #include "paddle/phi/kernels/funcs/cublaslt.h" -#include "paddle/phi/kernels/gpu/cuda_gemm_kernel.h" -#include "paddle/phi/kernels/transpose_kernel.h" -#elif defined(PADDLE_WITH_HIP) -#include "paddle/phi/kernels/funcs/hipblaslt.h" -#endif -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 -#include "paddle/phi/kernels/autotune/auto_tune_base.h" -#endif -#include "paddle/phi/kernels/full_kernel.h" -// clang-format on -namespace phi { - -static void GetBroadcastFromDims(const int x_ndim, - const std::int64_t* x_dims, - const int y_ndim, - const std::int64_t* y_dims, - std::int64_t* x_bd_dims, - std::int64_t* y_bd_dims, - std::int64_t* out_bd_dims) { - const int ndim = (std::max)(x_ndim, y_ndim); - std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1); - std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1); - std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim); - std::copy(y_dims, y_dims + y_ndim, y_bd_dims + ndim - y_ndim); - - for (int i = 0; i < ndim; ++i) { - PADDLE_ENFORCE_EQ( - x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1, - true, - phi::errors::InvalidArgument( - "Input(X) and Input(Y) has error dim. " - "X_broadcast's shape[%s] must be equal to Y_broadcast's shape[%s], " - "or X_broadcast's shape[%s] <= 1, or Y_broadcast's shape[%s] <= 1, " - "but received X_broadcast's shape[%s] = [%s]" - "received Y_broadcast's shape[%s] = [%s].", - i, - i, - i, - i, - i, - x_bd_dims[i], - i, - y_bd_dims[i])); - if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) { - out_bd_dims[i] = 0; - } else { - out_bd_dims[i] = (std::max)(x_bd_dims[i], y_bd_dims[i]); - } - } -} - -static int64_t GetIndexMessage(const int n, - const int64_t* dims, - const int64_t* index) { - int64_t sum = 0; - for (int i = 0; i < n; ++i) { - if (dims[i] > 1) { - sum = sum * dims[i] + index[i]; - } - } - return sum; -} - -static void IndexIncreaseFromDims(const int ndim, - const int64_t* dims, - int64_t* index) { - for (int i = ndim - 1; i >= 0; --i) { - ++index[i]; - if (index[i] >= dims[i]) { - index[i] -= dims[i]; - } else { - break; - } - } -} - -// The general implementation with blas. 
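// GetIndexMessage and IndexIncreaseFromDims above implement an odometer-style
// walk over the broadcast batch dimensions: the output batch index is advanced
// digit by digit, and each operand's flat batch offset accumulates only over
// dimensions whose extent is > 1, so size-1 (broadcast) dimensions keep
// reusing the same slice. A minimal standalone sketch of the same walk
// (illustrative names, not part of the original file):
#include <cstdint>
#include <vector>

// Flat offset of `index` within `dims`, skipping broadcast (size-1) dims.
static int64_t FlatOffsetSkippingBroadcast(const std::vector<int64_t>& dims,
                                           const std::vector<int64_t>& index) {
  int64_t sum = 0;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] > 1) sum = sum * dims[i] + index[i];
  }
  return sum;
}

// Odometer-style increment over `dims`, last dimension fastest.
static void NextBatchIndex(const std::vector<int64_t>& dims,
                           std::vector<int64_t>* index) {
  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
    if (++(*index)[i] < dims[i]) break;
    (*index)[i] = 0;
  }
}

// Example: with operand batch dims {1, 3} broadcast against output batch dims
// {2, 3}, output batches {0,...,5} map to operand offsets 0,1,2,0,1,2.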
-template -void MatMulFunctionImplWithBlas( - const Context& dev_ctx, - const DenseTensor& X, - const DenseTensor& Y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* Out, - bool trans_x, - bool trans_y, - bool flag = false, - phi::funcs::MatmulPlanner* matmul_planner UNUSED = nullptr) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - - // Get data ptr - const T* x_data = X.data(); - const T* y_data = Y.data(); - - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (x_ndim == 1 && y_ndim == 1) { - const int M = X.numel(); - const int N = Y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers, " - "when X/Y's dims =1. But received X has [%d] elements, " - "received Y has [%d] elements.", - M, - N)); - VLOG(3) << "MatMul's case 1"; - Out->Resize(common::make_ddim({})); - dev_ctx.template Alloc(Out); - blas.GEMM(CblasNoTrans, - CblasTrans, - 1, - 1, - M, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - return; - } - - if (x_ndim == 1) { - const int N = X.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - if (trans_y) { - const int M = Y.numel() / N; - VLOG(3) << "MatMul's case 2"; - blas.GEMV(false, - M, - N, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 3"; - blas.GEMV(true, - N, - M, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 4"; - blas.BatchedGEMM(CblasTrans, - CblasNoTrans, - M, - 1, - N, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - batch_size, - M * N, - 0); - } - } - return; - } - - if (y_ndim == 1) { - const int N = Y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." 
- "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 5"; - blas.GEMV(true, - N, - M, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 6"; - blas.BatchedGEMM(CblasTrans, - CblasNoTrans, - M, - 1, - N, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - batch_size, - M * N, - 0); - } - } else { - const int M = X.numel() / N; - VLOG(3) << "MatMul's case 7"; - blas.GEMV(false, - M, - N, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } - return; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - dev_ctx.template Alloc(Out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return; - if (x_batch_size == 1 && y_batch_size == 1) { - VLOG(3) << "MatMul's case 8"; - blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? 
CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - VLOG(3) << "MatMul's case 9"; - blas.GEMV(false, - y_batch_size * N, - K, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 10"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - 0, - K * N); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - VLOG(3) << "MatMul's case 11"; - blas.GEMM(CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - x_batch_size * M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 12"; - blas.BatchedGEMM(CblasTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - M * K, - 0); - } - } else if (!is_broadcast_dims) { - VLOG(3) << "MatMul's case 13"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - M * K, - K * N); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data + x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - VLOG(3) << "MatMul's case 14"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_ptr.data(), - y_ptr.data(), - static_cast(flag), - out_ptr.data(), - out_batch_size); - } -} - -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 -// This is almost a copy from MatMulFunctionImplWithBlas, -// compare cublas with cublasLt kernels when Matmul autotune is on -template -void MatMulFunctionImplWithCublasLt( - const Context& dev_ctx, - const DenseTensor& X, - const DenseTensor& Y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* Out, - bool trans_x, - bool trans_y, - bool flag = false, - phi::funcs::MatmulPlanner* matmul_planner = nullptr) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - const T* x_data = X.data(); - const T* y_data = Y.data(); - using blaslt = phi::funcs::MatmulWithCublasLt; - - if (x_ndim == 1 && y_ndim == 1) { - const int M = X.numel(); - const int N = Y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. 
But received X has [%d] elements," - "received Y has [%d] elements", - M, - N)); - - // MatMul's case 0 => vector * vector - Out->Resize(common::make_ddim({})); - dev_ctx.template Alloc(Out); - VLOG(3) << "MatMul with blaslt case 1"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - 1, - 1, - M, - false, - true, - matmul_planner); - return; - } - - if (x_ndim == 1) { - const int N = X.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - if (trans_y) { - const int M = Y.numel() / N; - VLOG(3) << "MatMul with blaslt 2"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - false, - false, - matmul_planner); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul with blaslt 3"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 4"; - blaslt::RunWithBatch(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - matmul_planner); - } - } - return; - } - - if (y_ndim == 1) { - const int N = Y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." 
- "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul with blaslt 5"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 6"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - matmul_planner); - } - } else { - const int M = X.numel() / N; - VLOG(3) << "MatMul with blaslt 7"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - false, - false, - matmul_planner); - } - return; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - dev_ctx.template Alloc(Out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return; - if (x_batch_size == 1 && y_batch_size == 1) { - VLOG(3) << "MatMul with blaslt 8"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - matmul_planner); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - VLOG(3) << "MatMul with blaslt 9"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - y_batch_size * N, - 1, - K, - false, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 10"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - 0, - K * N, - M * N, - matmul_planner); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - VLOG(3) << "MatMul with blaslt 11"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - x_batch_size * M, - N, - K, - false, - trans_y, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 12"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - true, - trans_y, - out_batch_size, - M * K, - 0, - M * N, - matmul_planner); - } - } else if (!is_broadcast_dims) { - VLOG(3) << "MatMul with blaslt 13"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - M * K, - K * N, - M * N, - matmul_planner); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data + x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - VLOG(3) << "MatMul with blaslt 14"; - blaslt::RunWithBatch(dev_ctx, - x_ptr.data(), - y_ptr.data(), - out_ptr.data(), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - matmul_planner); - } 
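[Editor's note] The case analysis above (cases 8-14 in both the blas and blaslt variants) ultimately reduces to one decision: how the batch dimensions broadcast. A minimal sketch of that decision, under the assumption that only the broadcast batch sizes matter and with an illustrative function name:

#include <cstdint>
#include <string>

std::string ChooseGemmPath(std::int64_t x_batch, std::int64_t y_batch,
                           bool batch_dims_match) {
  if (x_batch == 1 && y_batch == 1) return "single GEMM";
  if (x_batch == 1 || y_batch == 1)
    return "strided-batched GEMM (reused side gets stride 0)";
  if (batch_dims_match)
    return "strided-batched GEMM (strides M*K / K*N / M*N)";
  return "pointer-array batched GEMM (per-batch offsets via GetIndexMessage)";
}

Strided-batched GEMM keeps a stride of 0 on whichever operand is reused across batches, which is what cases 9-12 above encode; only genuinely mismatched batch shapes pay for the pointer-array path.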
-} -#endif - -template -struct MatMulDispatcher { - void operator()(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { - MatMulFunctionImplWithBlas( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); - } -}; - -#ifdef PADDLE_WITH_CUDA -template -struct MatMulDispatcher { - void operator()(const phi::GPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { -#if CUDA_VERSION >= 11060 && 0 - auto* tuner = phi::autotune::MakeMatmulTuner( - MatMulFunctionImplWithBlas); - tuner->AddCallBack(MatMulFunctionImplWithCublasLt); - phi::funcs::MatmulPlanner matmul_planner(x_dims, - y_dims, - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ flag, - /* no_exchange */ true); - tuner->Run(ctx, - matmul_planner.GetKey(), - ctx, - x, - y, - x_dims, - y_dims, - out, - trans_x, - trans_y, - flag, - &matmul_planner); -#else - MatMulFunctionImplWithBlas( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); -#endif - } -}; - -#endif // PADDLE_WITH_CUDA - -template -void MatMulFunction(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { - MatMulDispatcher()( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); -} - -template -bool MatMulInt8Function(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y) { - return false; -} - -#ifdef PADDLE_WITH_CUDA -template <> -bool inline MatMulInt8Function(const phi::GPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y) { - if (x.dtype() != DataType::INT8 || y.dtype() != DataType::INT8) { - return false; - } -#if CUDA_VERSION >= 11060 && 0 - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - const int8_t* x_data = x.data(); - const int8_t* y_data = y.data(); - using blaslt = phi::funcs::MatmulWithCublasLt; - - phi::funcs::MatmulPlanner matmul_planner( - x_dims, - y_dims, - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ false, - /* no_exchange */ true); - - if (x_ndim == 1 && y_ndim == 1) { - const int M = x.numel(); - const int N = y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - M, - N)); - if (!(M % 4 == 0)) { - return false; - } - - out->Resize(common::make_ddim({})); - ctx.template Alloc(out); - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - 1, - 1, - M, - false, - true, - &matmul_planner); - return true; - } - if (x_ndim == 1) { - const int N = x.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - if (!(N % 4 == 0)) { - return false; - } - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - const int M = y.numel() / N; - if (!(M == 1 || M % 4 == 0)) { - return false; - } - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - out->ResizeAndAllocate(common::make_ddim(out_dims)); - ctx.template Alloc(out); - if (trans_y) { - const int M = y.numel() / N; - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - false, - false, - &matmul_planner); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = y.numel() / (M * N); - if (batch_size == 1) { - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - &matmul_planner); - } - } - return true; - } - - if (y_ndim == 1) { - const int N = y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - const int M = x.numel() / N; - if (!((M == 1 || M % 4 == 0))) { - return false; - } - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - if (N % 4 != 0) { - return false; - } - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - out->ResizeAndAllocate(common::make_ddim(out_dims)); - ctx.template Alloc(out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = x.numel() / (M * N); - if (batch_size == 1) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - &matmul_planner); - } - } else { - const int M = x.numel() / N; - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - false, - false, - &matmul_planner); - } - return true; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - ctx.template Alloc(out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return true; - - if (x_batch_size == 1 && M == 1 && trans_y) { - if (!(K % 4 == 0)) { - return false; - } - } else if (!trans_x && !trans_y) { - if (!(N % 4 == 0 || N == 1) || !(K % 4 == 0) || (M == 1 && N == 1)) { - return false; - } - } else if (!trans_x && trans_y) { - if (!(K % 4 == 0)) { - return false; - } - } else if (trans_x && !trans_y) { - if (!(M % 4 == 0 || M == 1) || !(N % 4 == 0 || N == 1)) { - return false; - } - } else { - if (!(M % 4 == 0 || M == 1) || !(K % 4 == 0)) { - return false; - } - } - if (x_batch_size == 1 && y_batch_size == 1) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - &matmul_planner); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - y_batch_size * N, - 1, - K, - false, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - 0, - K * N, - M * N, - &matmul_planner); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - x_batch_size * M, - N, - K, - false, - trans_y, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - true, - trans_y, - out_batch_size, - M * K, - 0, - M * N, - &matmul_planner); - } - } else if (!is_broadcast_dims) { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - M * K, - K * N, - M * N, - &matmul_planner); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data 
+ x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = ctx.template Alloc(out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - blaslt::RunWithBatch(ctx, - x_ptr.data(), - y_ptr.data(), - out_ptr.data(), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - &matmul_planner); - } - return true; -#else - return false; -#endif -} -#endif - -template -typename std::enable_if::value>::type -MatmulJudgeDtypeKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool transpose_x, - bool transpose_y) { - bool try_matmul_int8 = MatMulInt8Function( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); - if (try_matmul_int8) { - return; - } - auto x_tmp = phi::Cast(ctx, x, phi::DataType::FLOAT32); - auto y_tmp = phi::Cast(ctx, y, phi::DataType::FLOAT32); - DenseTensor out_tmp; - MatMulFunction( - ctx, x_tmp, y_tmp, x_dims, y_dims, &out_tmp, transpose_x, transpose_y); - if (x.dtype() == phi::DataType::INT8) { - phi::CastKernel(ctx, out_tmp, phi::DataType::INT32, out); - return; - } - phi::CastKernel(ctx, out_tmp, x.dtype(), out); -} - -template -typename std::enable_if::value>::type -MatmulJudgeDtypeKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool transpose_x, - bool transpose_y) { - MatMulFunction( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); -} - -template -void MatmulKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - DenseTensor* out) { - if (x.numel() == 0 || y.numel() == 0) { - // input shape [1, 1, 5, 0], [1, 1, 0, 5], result shape is [1, 1, 5, 5] - phi::Full( - ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); - return; - } - PADDLE_ENFORCE_GE( - common::product(x.dims()), - 0, - common::errors::InvalidArgument( - "The dims of Input(X) should be greater than or equal to 0.")); - PADDLE_ENFORCE_GE( - common::product(y.dims()), - 0, - common::errors::InvalidArgument( - "The dims of Input(Y) should be greater than or equal to 0.")); - const std::vector x_dims = common::vectorize(x.dims()); - const std::vector y_dims = common::vectorize(y.dims()); - MatmulJudgeDtypeKernel( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); -} - -template -void MatmulWithFlattenKernelImpl(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - const DenseTensor x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - const DenseTensor y_matrix = - y.dims().size() > 2 ? 
phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - dev_ctx.template Alloc(out); - auto z_dim = out->dims(); - if (z_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.MatMul(x_matrix, y_matrix, out); - if (z_dim.size() != 2) { - out->Resize(z_dim); - } -} - -#ifdef PADDLE_WITH_CUDA - -template -void MatmulWithFlattenKernelInt8Impl(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - PADDLE_ENFORCE_EQ( - x.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(x) used in int8 mul must be (%s) " - "does not match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - x.dtype())); - PADDLE_ENFORCE_EQ( - y.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(y) used in int8 mul must be (%s) " - "does not match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - y.dtype())); - - const DenseTensor x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - const DenseTensor y_matrix = - y.dims().size() > 2 ? phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - PADDLE_ENFORCE_EQ( - x_matrix.dims()[1], - y_matrix.dims()[0], - phi::errors::InvalidArgument( - "X's numbers of columns must be equal to Y's numbers of rows." - "But received X has [%d] columns," - "received Y has [%d] rows", - x_matrix.dims()[1], - y_matrix.dims()[0])); - - PADDLE_ENFORCE_EQ((y_matrix.dims()[1] % 4 == 0 || y_matrix.dims()[1] == 1), - true, - phi::errors::InvalidArgument( - "The dimension size N used in int8 mul must be 1" - "or a multiple of 4 does not match the size (%d)" - "currently contained in the container.", - y_matrix.dims()[1])); - PADDLE_ENFORCE_EQ((x_matrix.dims()[1] % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 mul must be a" - "multiple of 4 does not match the size (%d) currently" - "contained in the container.", - x_matrix.dims()[1])); - - dev_ctx.template Alloc(out); - auto z_dim = out->dims(); - if (z_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - -#if CUDA_VERSION >= 11060 && 0 - using blaslt = phi::funcs::MatmulWithCublasLt; - - const int8_t* x_data = x_matrix.data(); - const int8_t* y_data = y_matrix.data(); - - std::vector x_dims = {x_matrix.dims()[0], x_matrix.dims()[1]}; - std::vector y_dims = {y_matrix.dims()[0], y_matrix.dims()[1]}; - phi::funcs::MatmulPlanner matmul_planner( - x_dims, - y_dims, - false, - false, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ false, - /* no_exchange */ true); - - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(out), - x_matrix.dims()[0], - y_matrix.dims()[1], - x_matrix.dims()[1], - false, - false, - &matmul_planner); - - if (z_dim.size() != 2) { - out->Resize(z_dim); - } -#endif -} -#endif - -#ifdef PADDLE_WITH_CUDA -template -typename std::enable_if::value, - void>::type -DispatchMatmulWithFlattenInt8Kernel(const phi::GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - MatmulWithFlattenKernelInt8Impl( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} -#endif - -template -typename std::enable_if::value, - void>::type 
-DispatchMatmulWithFlattenInt8Kernel(const phi::CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - PADDLE_THROW(phi::errors::Unimplemented( - "MatmulWithFlatten with CPU is NOT implemented " - "yet.")); -} - -template -typename std::enable_if::value, void>::type -DispatchMatmulFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - DispatchMatmulWithFlattenInt8Kernel( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -typename std::enable_if::value, void>::type -DispatchMatmulFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - MatmulWithFlattenKernelImpl( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -void MatmulWithFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - DispatchMatmulFlattenKernel( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -void LegacyMatmulKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - float alpha, - DenseTensor* out) { - MatmulKernel(ctx, x, y, transpose_x, transpose_y, out); - if (std::fabs(alpha - 1.f) > 1e-6f) { - ScaleKernel(ctx, *out, Scalar(alpha), Scalar(0), false, out); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl_maca.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl_maca.h deleted file mode 100644 index 9750abae5ca..00000000000 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl_maca.h +++ /dev/null @@ -1,1696 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -// clang-format off -#include "glog/logging.h" - -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/autotune/cache_base.h" -#include "paddle/phi/kernels/cast_kernel.h" -#include "../funcs/blas/blas.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.hip.h" -#else -#include "../funcs/blas/blaslt_impl.cu.h" -#endif -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/scale_kernel.h" -#if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/kernels/funcs/cublaslt.h" -#include "paddle/phi/kernels/gpu/cuda_gemm_kernel.h" -#include "paddle/phi/kernels/transpose_kernel.h" -#elif defined(PADDLE_WITH_HIP) -#include "paddle/phi/kernels/funcs/hipblaslt.h" -#endif -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 -#include "paddle/phi/kernels/autotune/auto_tune_base.h" -#endif -// clang-format on -namespace phi { - -static void GetBroadcastFromDims(const int x_ndim, - const std::int64_t* x_dims, - const int y_ndim, - const std::int64_t* y_dims, - std::int64_t* x_bd_dims, - std::int64_t* y_bd_dims, - std::int64_t* out_bd_dims) { - const int ndim = (std::max)(x_ndim, y_ndim); - std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1); - std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1); - std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim); - std::copy(y_dims, y_dims + y_ndim, y_bd_dims + ndim - y_ndim); - - for (int i = 0; i < ndim; ++i) { - PADDLE_ENFORCE_EQ( - x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1, - true, - phi::errors::InvalidArgument( - "Input(X) and Input(Y) has error dim. " - "X_broadcast's shape[%s] must be equal to Y_broadcast's shape[%s], " - "or X_broadcast's shape[%s] <= 1, or Y_broadcast's shape[%s] <= 1, " - "but received X_broadcast's shape[%s] = [%s]" - "received Y_broadcast's shape[%s] = [%s].", - i, - i, - i, - i, - i, - x_bd_dims[i], - i, - y_bd_dims[i])); - if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) { - out_bd_dims[i] = 0; - } else { - out_bd_dims[i] = (std::max)(x_bd_dims[i], y_bd_dims[i]); - } - } -} - -static int64_t GetIndexMessage(const int n, - const int64_t* dims, - const int64_t* index) { - int64_t sum = 0; - for (int i = 0; i < n; ++i) { - if (dims[i] > 1) { - sum = sum * dims[i] + index[i]; - } - } - return sum; -} - -static void IndexIncreaseFromDims(const int ndim, - const int64_t* dims, - int64_t* index) { - for (int i = ndim - 1; i >= 0; --i) { - ++index[i]; - if (index[i] >= dims[i]) { - index[i] -= dims[i]; - } else { - break; - } - } -} - -// The general implementation with blas. -template -void MatMulFunctionImplWithBlas( - const Context& dev_ctx, - const DenseTensor& X, - const DenseTensor& Y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* Out, - bool trans_x, - bool trans_y, - bool flag = false, - phi::funcs::MatmulPlanner* matmul_planner UNUSED = nullptr) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - - // Get data ptr - const T* x_data = X.data(); - const T* y_data = Y.data(); - - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (x_ndim == 1 && y_ndim == 1) { - const int M = X.numel(); - const int N = Y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers, " - "when X/Y's dims =1. 
But received X has [%d] elements, " - "received Y has [%d] elements.", - M, - N)); - VLOG(3) << "MatMul's case 1"; - Out->Resize(common::make_ddim({})); - dev_ctx.template Alloc(Out); - blas.GEMM(CblasNoTrans, - CblasTrans, - 1, - 1, - M, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - return; - } - - if (x_ndim == 1) { - const int N = X.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - if (trans_y) { - const int M = Y.numel() / N; - VLOG(3) << "MatMul's case 2"; - blas.GEMV(false, - M, - N, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 3"; - blas.GEMV(true, - N, - M, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 4"; - blas.BatchedGEMM(CblasTrans, - CblasNoTrans, - M, - 1, - N, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - batch_size, - M * N, - 0); - } - } - return; - } - - if (y_ndim == 1) { - const int N = Y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 5"; - blas.GEMV(true, - N, - M, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 6"; - blas.BatchedGEMM(CblasTrans, - CblasNoTrans, - M, - 1, - N, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - batch_size, - M * N, - 0); - } - } else { - const int M = X.numel() / N; - VLOG(3) << "MatMul's case 7"; - blas.GEMV(false, - M, - N, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } - return; - } - - const int M = trans_x ? 
x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - dev_ctx.template Alloc(Out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return; - if (x_batch_size == 1 && y_batch_size == 1) { - VLOG(3) << "MatMul's case 8"; - blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - VLOG(3) << "MatMul's case 9"; - blas.GEMV(false, - y_batch_size * N, - K, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 10"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - 0, - K * N); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - VLOG(3) << "MatMul's case 11"; - blas.GEMM(CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - x_batch_size * M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 12"; - blas.BatchedGEMM(CblasTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - M * K, - 0); - } - } else if (!is_broadcast_dims) { - VLOG(3) << "MatMul's case 13"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? 
CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - M * K, - K * N); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data + x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - VLOG(3) << "MatMul's case 14"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_ptr.data(), - y_ptr.data(), - static_cast(flag), - out_ptr.data(), - out_batch_size); - } -} - -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 -// This is almost a copy from MatMulFunctionImplWithBlas, -// compare cublas with cublasLt kernels when Matmul autotune is on -template -void MatMulFunctionImplWithCublasLt( - const Context& dev_ctx, - const DenseTensor& X, - const DenseTensor& Y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* Out, - bool trans_x, - bool trans_y, - bool flag = false, - phi::funcs::MatmulPlanner* matmul_planner = nullptr) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - const T* x_data = X.data(); - const T* y_data = Y.data(); - using blaslt = phi::funcs::MatmulWithCublasLt; - - if (x_ndim == 1 && y_ndim == 1) { - const int M = X.numel(); - const int N = Y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - M, - N)); - - // MatMul's case 0 => vector * vector - Out->Resize(common::make_ddim({})); - dev_ctx.template Alloc(Out); - VLOG(3) << "MatMul with blaslt case 1"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - 1, - 1, - M, - false, - true, - matmul_planner); - return; - } - - if (x_ndim == 1) { - const int N = X.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - if (trans_y) { - const int M = Y.numel() / N; - VLOG(3) << "MatMul with blaslt 2"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - false, - false, - matmul_planner); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul with blaslt 3"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 4"; - blaslt::RunWithBatch(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - matmul_planner); - } - } - return; - } - - if (y_ndim == 1) { - const int N = Y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul with blaslt 5"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 6"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - matmul_planner); - } - } else { - const int M = X.numel() / N; - VLOG(3) << "MatMul with blaslt 7"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - false, - false, - matmul_planner); - } - return; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - dev_ctx.template Alloc(Out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return; - if (x_batch_size == 1 && y_batch_size == 1) { - VLOG(3) << "MatMul with blaslt 8"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - matmul_planner); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - VLOG(3) << "MatMul with blaslt 9"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - y_batch_size * N, - 1, - K, - false, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 10"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - 0, - K * N, - M * N, - matmul_planner); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - VLOG(3) << "MatMul with blaslt 11"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - x_batch_size * M, - N, - K, - false, - trans_y, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 12"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - true, - trans_y, - out_batch_size, - M * K, - 0, - M * N, - matmul_planner); - } - } else if (!is_broadcast_dims) { - VLOG(3) << "MatMul with blaslt 13"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - M * K, - K * N, - M * N, - matmul_planner); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data + x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - VLOG(3) << "MatMul with blaslt 14"; - blaslt::RunWithBatch(dev_ctx, - x_ptr.data(), - y_ptr.data(), - out_ptr.data(), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - matmul_planner); - } 
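[Editor's note] The int8 fast path that follows (MatMulInt8Function) is gated on alignment: the cublasLt int8 GEMM is only attempted when the relevant dimensions are multiples of 4 (or trivially 1). A condensed, standalone sketch of those shape checks; the function name is illustrative and the batch-size special cases are omitted:

#include <cstdint>

bool Int8GemmShapesSupported(std::int64_t M, std::int64_t N, std::int64_t K,
                             bool trans_x, bool trans_y) {
  auto mult4 = [](std::int64_t v) { return v % 4 == 0; };
  if (!trans_x && !trans_y)
    return (mult4(N) || N == 1) && mult4(K) && !(M == 1 && N == 1);
  if (!trans_x && trans_y) return mult4(K);
  if (trans_x && !trans_y) return (mult4(M) || M == 1) && (mult4(N) || N == 1);
  return (mult4(M) || M == 1) && mult4(K);  // trans_x && trans_y
}

When any check fails the function returns false rather than raising, and MatmulJudgeDtypeKernel casts both operands to float32 and reruns the general path, casting the result back afterwards.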
-} -#endif - -template -struct MatMulDispatcher { - void operator()(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { - MatMulFunctionImplWithBlas( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); - } -}; - -#ifdef PADDLE_WITH_CUDA -template -struct MatMulDispatcher { - void operator()(const phi::GPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { -#if CUDA_VERSION >= 11060 && 0 - auto* tuner = phi::autotune::MakeMatmulTuner( - MatMulFunctionImplWithBlas); - tuner->AddCallBack(MatMulFunctionImplWithCublasLt); - phi::funcs::MatmulPlanner matmul_planner(x_dims, - y_dims, - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ flag, - /* no_exchange */ true); - tuner->Run(ctx, - matmul_planner.GetKey(), - ctx, - x, - y, - x_dims, - y_dims, - out, - trans_x, - trans_y, - flag, - &matmul_planner); -#else - MatMulFunctionImplWithBlas( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); -#endif - } -}; - -#endif // PADDLE_WITH_CUDA - -template -void MatMulFunction(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { - MatMulDispatcher()( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); -} - -template -bool MatMulInt8Function(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y) { - return false; -} - -#ifdef PADDLE_WITH_CUDA -template <> -bool inline MatMulInt8Function(const phi::GPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y) { - if (x.dtype() != DataType::INT8 || y.dtype() != DataType::INT8) { - return false; - } -#if CUDA_VERSION >= 11060 && 0 - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - const int8_t* x_data = x.data(); - const int8_t* y_data = y.data(); - using blaslt = phi::funcs::MatmulWithCublasLt; - - phi::funcs::MatmulPlanner matmul_planner( - x_dims, - y_dims, - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ false, - /* no_exchange */ true); - - if (x_ndim == 1 && y_ndim == 1) { - const int M = x.numel(); - const int N = y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - M, - N)); - if (!(M % 4 == 0)) { - return false; - } - - out->Resize(common::make_ddim({})); - ctx.template Alloc(out); - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - 1, - 1, - M, - false, - true, - &matmul_planner); - return true; - } - if (x_ndim == 1) { - const int N = x.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - if (!(N % 4 == 0)) { - return false; - } - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - const int M = y.numel() / N; - if (!(M == 1 || M % 4 == 0)) { - return false; - } - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - out->ResizeAndAllocate(common::make_ddim(out_dims)); - ctx.template Alloc(out); - if (trans_y) { - const int M = y.numel() / N; - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - false, - false, - &matmul_planner); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = y.numel() / (M * N); - if (batch_size == 1) { - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - &matmul_planner); - } - } - return true; - } - - if (y_ndim == 1) { - const int N = y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - const int M = x.numel() / N; - if (!((M == 1 || M % 4 == 0))) { - return false; - } - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - if (N % 4 != 0) { - return false; - } - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - out->ResizeAndAllocate(common::make_ddim(out_dims)); - ctx.template Alloc(out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = x.numel() / (M * N); - if (batch_size == 1) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - &matmul_planner); - } - } else { - const int M = x.numel() / N; - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - false, - false, - &matmul_planner); - } - return true; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - ctx.template Alloc(out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return true; - - if (x_batch_size == 1 && M == 1 && trans_y) { - if (!(K % 4 == 0)) { - return false; - } - } else if (!trans_x && !trans_y) { - if (!(N % 4 == 0 || N == 1) || !(K % 4 == 0) || (M == 1 && N == 1)) { - return false; - } - } else if (!trans_x && trans_y) { - if (!(K % 4 == 0)) { - return false; - } - } else if (trans_x && !trans_y) { - if (!(M % 4 == 0 || M == 1) || !(N % 4 == 0 || N == 1)) { - return false; - } - } else { - if (!(M % 4 == 0 || M == 1) || !(K % 4 == 0)) { - return false; - } - } - if (x_batch_size == 1 && y_batch_size == 1) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - &matmul_planner); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - y_batch_size * N, - 1, - K, - false, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - 0, - K * N, - M * N, - &matmul_planner); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - x_batch_size * M, - N, - K, - false, - trans_y, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - true, - trans_y, - out_batch_size, - M * K, - 0, - M * N, - &matmul_planner); - } - } else if (!is_broadcast_dims) { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - M * K, - K * N, - M * N, - &matmul_planner); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data 
+ x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = ctx.template Alloc(out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - blaslt::RunWithBatch(ctx, - x_ptr.data(), - y_ptr.data(), - out_ptr.data(), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - &matmul_planner); - } - return true; -#else - return false; -#endif -} -#endif - -template -typename std::enable_if::value>::type -MatmulJudgeDtypeKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool transpose_x, - bool transpose_y) { - bool try_matmul_int8 = MatMulInt8Function( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); - if (try_matmul_int8) { - return; - } - auto x_tmp = phi::Cast(ctx, x, phi::DataType::FLOAT32); - auto y_tmp = phi::Cast(ctx, y, phi::DataType::FLOAT32); - DenseTensor out_tmp; - MatMulFunction( - ctx, x_tmp, y_tmp, x_dims, y_dims, &out_tmp, transpose_x, transpose_y); - if (x.dtype() == phi::DataType::INT8) { - phi::CastKernel(ctx, out_tmp, phi::DataType::INT32, out); - return; - } - phi::CastKernel(ctx, out_tmp, x.dtype(), out); -} - -template -typename std::enable_if::value>::type -MatmulJudgeDtypeKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool transpose_x, - bool transpose_y) { - MatMulFunction( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); -} - -template -void MatmulKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - DenseTensor* out) { - PADDLE_ENFORCE_NE( - common::product(x.dims()), - 0, - phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE( - common::product(y.dims()), - 0, - phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. ")); - const std::vector x_dims = common::vectorize(x.dims()); - const std::vector y_dims = common::vectorize(y.dims()); - MatmulJudgeDtypeKernel( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); -} - -template -void MatmulWithFlattenKernelImpl(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - const DenseTensor x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - const DenseTensor y_matrix = - y.dims().size() > 2 ? 
phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - dev_ctx.template Alloc(out); - auto z_dim = out->dims(); - if (z_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.MatMul(x_matrix, y_matrix, out); - if (z_dim.size() != 2) { - out->Resize(z_dim); - } -} - -#ifdef PADDLE_WITH_CUDA - -template -void MatmulWithFlattenKernelInt8Impl(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - PADDLE_ENFORCE_EQ( - x.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(x) used in int8 mul must be (%s) " - "does not match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - x.dtype())); - PADDLE_ENFORCE_EQ( - y.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(y) used in int8 mul must be (%s) " - "does not match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - y.dtype())); - - const DenseTensor x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - const DenseTensor y_matrix = - y.dims().size() > 2 ? phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - PADDLE_ENFORCE_EQ( - x_matrix.dims()[1], - y_matrix.dims()[0], - phi::errors::InvalidArgument( - "X's numbers of columns must be equal to Y's numbers of rows." - "But received X has [%d] columns," - "received Y has [%d] rows", - x_matrix.dims()[1], - y_matrix.dims()[0])); - - PADDLE_ENFORCE_EQ((y_matrix.dims()[1] % 4 == 0 || y_matrix.dims()[1] == 1), - true, - phi::errors::InvalidArgument( - "The dimension size N used in int8 mul must be 1" - "or a multiple of 4 does not match the size (%d)" - "currently contained in the container.", - y_matrix.dims()[1])); - PADDLE_ENFORCE_EQ((x_matrix.dims()[1] % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 mul must be a" - "multiple of 4 does not match the size (%d) currently" - "contained in the container.", - x_matrix.dims()[1])); - - dev_ctx.template Alloc(out); - auto z_dim = out->dims(); - if (z_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - -#if CUDA_VERSION >= 11060 && 0 - using blaslt = phi::funcs::MatmulWithCublasLt; - - const int8_t* x_data = x_matrix.data(); - const int8_t* y_data = y_matrix.data(); - - std::vector x_dims = {x_matrix.dims()[0], x_matrix.dims()[1]}; - std::vector y_dims = {y_matrix.dims()[0], y_matrix.dims()[1]}; - phi::funcs::MatmulPlanner matmul_planner( - x_dims, - y_dims, - false, - false, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ false, - /* no_exchange */ true); - - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(out), - x_matrix.dims()[0], - y_matrix.dims()[1], - x_matrix.dims()[1], - false, - false, - &matmul_planner); - - if (z_dim.size() != 2) { - out->Resize(z_dim); - } -#endif -} -#endif - -#ifdef PADDLE_WITH_CUDA -template -typename std::enable_if::value, - void>::type -DispatchMatmulWithFlattenInt8Kernel(const phi::GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - MatmulWithFlattenKernelInt8Impl( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} -#endif - -template -typename std::enable_if::value, - void>::type 
-DispatchMatmulWithFlattenInt8Kernel(const phi::CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - PADDLE_THROW(phi::errors::Unimplemented( - "MatmulWithFlatten with CPU is NOT implemented " - "yet.")); -} - -template -typename std::enable_if::value, void>::type -DispatchMatmulFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - DispatchMatmulWithFlattenInt8Kernel( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -typename std::enable_if::value, void>::type -DispatchMatmulFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - MatmulWithFlattenKernelImpl( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -void MatmulWithFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - DispatchMatmulFlattenKernel( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h b/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h index aaa7fbd8d2c..7ba97234cc1 100644 --- a/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { template diff --git a/backends/metax_gpu/kernels/impl/mv_kernel_impl.h b/backends/metax_gpu/kernels/impl/mv_kernel_impl.h index a87d431e250..4baee25a099 100644 --- a/backends/metax_gpu/kernels/impl/mv_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/mv_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/solve_grad_kernel_impl.h index 860bce2cba5..1dd276dde2f 100644 --- a/backends/metax_gpu/kernels/impl/solve_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/solve_grad_kernel_impl.h @@ -14,11 +14,11 @@ limitations under the License. 
*/ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/expand_as_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_solve.h" #include "paddle/phi/kernels/funcs/reduce_function.h" diff --git a/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h index 08138853099..ad656b7a6c8 100644 --- a/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h @@ -14,10 +14,10 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu index 51f8f6792e2..c31d82920b3 100644 --- a/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu @@ -14,10 +14,10 @@ #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { diff --git a/backends/metax_gpu/kernels/metax_kernel/block_attn.h b/backends/metax_gpu/kernels/metax_kernel/block_attn.h index 1e1eb2c0961..a5b88e34be1 100644 --- a/backends/metax_gpu/kernels/metax_kernel/block_attn.h +++ b/backends/metax_gpu/kernels/metax_kernel/block_attn.h @@ -14,11 +14,11 @@ #pragma once -#include "kernels/funcs/quant_dequant.h" #include "kernels/metax_kernel/mmha_util.cu.h" #include "paddle/common/flags.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/kernels/funcs/quant_dequant.h" COMMON_DECLARE_bool(use_xqa_optim); COMMON_DECLARE_bool(blha_use_fp32_qk_sum); diff --git a/backends/metax_gpu/kernels/metax_kernel/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h index 52a7709424b..b9f3d8af1c9 100644 --- a/backends/metax_gpu/kernels/metax_kernel/elementwise.h +++ b/backends/metax_gpu/kernels/metax_kernel/elementwise.h @@ -14,9 +14,9 @@ limitations under the License. 
*/ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 7386811a236..18f1e30f191 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -17,9 +17,9 @@ #include #include -#include "kernels/funcs/blas/cublasLt.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/custom/custom_context.h" +#include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_helper.h" @@ -28,8 +28,6 @@ #include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" -cublasLtHandle_t GetBlasLtHandle(); - namespace phi { class DnnWorkspaceHandle { public: diff --git a/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu index 895484324a9..8cf069c0f4b 100644 --- a/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/mv_grad_kernel.h" namespace phi { diff --git a/backends/metax_gpu/kernels/metax_kernel/quant_dequant.h b/backends/metax_gpu/kernels/metax_kernel/quant_dequant.h index a37fc8c5c57..80d325530f5 100644 --- a/backends/metax_gpu/kernels/metax_kernel/quant_dequant.h +++ b/backends/metax_gpu/kernels/metax_kernel/quant_dequant.h @@ -16,12 +16,12 @@ limitations under the License. 
*/ #include -#include "kernels/funcs/blas/blas.h" #include "paddle/common/hostdevice.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/transform.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { diff --git a/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu index bee25a721fa..ba33e68aa5e 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu @@ -17,8 +17,8 @@ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" -// #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "kernels/funcs/blas/blas.h" +// #include "paddle/phi/paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/rank_attention.cu.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu index b6a4d2d76e9..eeb9c938888 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu @@ -17,8 +17,8 @@ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" -// #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "kernels/funcs/blas/blas.h" +// #include "paddle/phi/paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/rank_attention.cu.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu index de263c91c4d..3e9a5683ae4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu @@ -20,12 +20,12 @@ #include #include "glog/logging.h" -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/determinant_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/impl/determinant_kernel_impl.h" #include "paddle/phi/kernels/slogdeterminant_kernel.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu index 5ff3211fe87..ed1ed259437 100644 --- a/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/funcs/blas/blas.h" #include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 7ba32b5b399..70553934dfb 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -31,6 +31,56 @@ index bff0f2bf70..9376b5781f 100644 #include "paddle/phi/core/os_info.h" #include "paddle/phi/core/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/platform/profiler/utils.h" +diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h +index 62beb53cfe..0b0ac09fc0 100644 +--- a/paddle/phi/backends/dynload/cublas.h ++++ b/paddle/phi/backends/dynload/cublas.h +@@ -49,7 +49,12 @@ extern void *cublas_dso_handle; + std::call_once(cublas_dso_flag, []() { \ + cublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ + }); \ +- static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ ++ std::string replaced_name = #__name; \ ++ replaced_name = replaced_name.replace(0, 2, "mc"); \ ++ int index = replaced_name.find("_", 0); \ ++ if (index != -1) replaced_name = replaced_name.substr(0, index); \ ++ static void* p_##__name = \ ++ dlsym(cublas_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ +diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h +index 0527e743e7..247a844f18 100644 +--- a/paddle/phi/backends/dynload/cublasLt.h ++++ b/paddle/phi/backends/dynload/cublasLt.h +@@ -46,12 +46,14 @@ extern void *cublasLt_dso_handle; + std::call_once(cublasLt_dso_flag, []() { \ + cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \ + }); \ +- static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \ ++ std::string replaced_name = #__name; \ ++ replaced_name = replaced_name.replace(0, 2, "mc"); \ ++ static void* p_##__name = \ ++ dlsym(cublasLt_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +- + // APIs available after CUDA 11.1 + #if CUDA_VERSION >= 11010 + #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ +@@ -79,8 +81,8 @@ extern void *cublasLt_dso_handle; + __macro(cublasLtMatmulAlgoConfigGetAttribute); \ + __macro(cublasLtMatmulAlgoGetIds); \ + __macro(cublasLtMatmulAlgoCapGetAttribute); \ +- __macro(cublasLtMatmulAlgoCheck); \ +- __macro(cublasLtGetCudartVersion); ++ __macro(cublasLtMatmulAlgoCheck); ++ // __macro(cublasLtGetCudartVersion); + #else + #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h index c0080f0a5e..458ca3e2e8 100644 --- a/paddle/phi/backends/dynload/cudnn.h @@ -210,6 +260,29 @@ index 8ec3cf2792..6f5460df00 100644 return reinterpret_cast(p_##__name)(args...); \ } \ }; \ +diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc +index 859f696896..87b5100a1b 100644 +--- a/paddle/phi/backends/dynload/dynamic_loader.cc ++++ b/paddle/phi/backends/dynload/dynamic_loader.cc +@@ -18,7 +18,6 @@ limitations under the License. 
*/ + #include + #include + #include +-#include "paddle/phi/backends/dynload/cupti_lib_path.h" + #include "paddle/phi/common/port.h" + #include "paddle/phi/core/enforce.h" + +@@ -108,6 +107,10 @@ COMMON_DECLARE_string(win_cuda_bin_dir); + #define SPARSELT_LIB_NAME "libcusparseLt.so" + #endif + ++#ifndef CUPTI_LIB_PATH ++#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@" ++#endif ++ + #ifdef PADDLE_WITH_HIP + + PHI_DEFINE_string(miopen_dir, diff --git a/paddle/phi/backends/dynload/nvjpeg.h b/paddle/phi/backends/dynload/nvjpeg.h index c5309e7e11..3328571380 100644 --- a/paddle/phi/backends/dynload/nvjpeg.h @@ -346,21 +419,10 @@ index 4ff2e528a9..23f7f4b583 100644 for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h -index 024a7de73e..1e4cdf16be 100644 +index 024a7de73e..66b373d698 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h -@@ -45,7 +45,9 @@ limitations under the License. */ - #endif - - #ifdef PADDLE_WITH_CUDA --#include "paddle/phi/backends/dynload/cublas.h" -+// #include "paddle/phi/backends/dynload/../../../../../cublas.h" -+#include "../backends/metax_gpu/kernels/funcs/blas/cublas.h" -+// #include "paddle/phi/backends/dynload/cublas.h" - #include "paddle/phi/backends/dynload/cudnn.h" - #include "paddle/phi/backends/dynload/curand.h" - #include "paddle/phi/backends/dynload/cusolver.h" -@@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } +@@ -97,7 +97,7 @@ inline bool is_error(bool stat) { return !stat; } void ThrowWarnInternal(const std::string& message); @@ -369,75 +431,68 @@ index 024a7de73e..1e4cdf16be 100644 // For cuda, the assertions can affect performance and it is therefore // recommended to disable them in production code // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion -@@ -109,7 +111,7 @@ void ThrowWarnInternal(const std::string& message); +@@ -109,7 +109,7 @@ void ThrowWarnInternal(const std::string& message); __LINE__, \ #_IS_NOT_ERROR, \ ##__VA_ARGS__); \ - asm("trap;"); \ -+ __builtin_trap(); \ ++ __builtin_trap(); \ } \ } while (0) #elif defined(__HIPCC__) -@@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - - } // namespace enforce - using namespace enforce; // NOLINT --} // namespace phi -+} // namespace phi -\ No newline at end of file -diff --git a/paddle/phi/core/platform/device/gpu/gpu_types.h b/paddle/phi/core/platform/device/gpu/gpu_types.h -index c646e487d0..325122175c 100644 ---- a/paddle/phi/core/platform/device/gpu/gpu_types.h -+++ b/paddle/phi/core/platform/device/gpu/gpu_types.h -@@ -25,8 +25,9 @@ - #else - #include - --#include "paddle/phi/backends/dynload/cublas.h" --#include "paddle/phi/backends/dynload/cublasLt.h" -+// #include "paddle/phi/backends/dynload/cublas.h" -+#include "kernels/funcs/blas/cublas.h" -+// #include "paddle/phi/backends/dynload/cublasLt.h" - #include "paddle/phi/backends/dynload/cudnn.h" - #endif - -@@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - - // TODO(Ming Huang): Since there is no blasLt handler, - // use rocblas_handle for workaround. 
--DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); -+// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - - #undef DECLARE_TYPE_FOR_GPU - -diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h -index 2d02eb370b..8a7233e34e 100644 ---- a/paddle/phi/core/platform/device_context.h -+++ b/paddle/phi/core/platform/device_context.h -@@ -25,8 +25,8 @@ limitations under the License. */ - #include "paddle/phi/core/platform/device/gpu/gpu_types.h" - #include "paddle/phi/core/platform/device_type.h" - #ifdef PADDLE_WITH_CUDA --#include "paddle/phi/backends/dynload/cublas.h" --#include "paddle/phi/backends/dynload/cublasLt.h" -+#include "kernels/funcs/blas/cublas.h" -+#include "kernels/funcs/blas/cublasLt.h" - #include "paddle/phi/backends/dynload/cudnn.h" - #include "paddle/phi/backends/dynload/cusolver.h" - #include "paddle/phi/backends/dynload/cusparse.h" -diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h -index d69eb67d6f..1d8b6e9375 100644 ---- a/paddle/phi/kernels/cpu/index_select_impl.h -+++ b/paddle/phi/kernels/cpu/index_select_impl.h -@@ -18,7 +18,7 @@ +diff --git a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h +index e63b3d2f6e..95d7e6f204 100644 +--- a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h ++++ b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h +@@ -628,7 +628,13 @@ class CublasLtAlgoCache { + infile >> cublaslt_version; + VLOG(1) << "cublaslt_version " << cublaslt_version; + +- if (dynload::cublasLtGetCudartVersion() != cublaslt_version) { ++ // if (dynload::cublasLtGetCudartVersion() != cublaslt_version) { ++ // LOG(INFO) << algo_caches_file_ ++ // << " is not compatible with current cublaslt_version " ++ // << real_cublaslt_version; ++ // return; ++ // } ++ if (3000 != cublaslt_version) { + LOG(INFO) << algo_caches_file_ + << " is not compatible with current cublaslt_version " + << real_cublaslt_version; +@@ -655,7 +661,8 @@ class CublasLtAlgoCache { + if (dev == 0) { + std::ofstream outfile; + outfile.open(algo_caches_file_, std::ios::out | std::ios::trunc); +- outfile << dynload::cublasLtGetCudartVersion() << std::endl; ++ // outfile << dynload::cublasLtGetCudartVersion() << std::endl; ++ outfile << 3000 << std::endl; + + for (const auto& [seed, algo] : algo_caches_) { + outfile << seed << " "; +diff --git a/paddle/phi/kernels/funcs/cublaslt.h b/paddle/phi/kernels/funcs/cublaslt.h +index e7e1dd2370..583c7d6474 100644 +--- a/paddle/phi/kernels/funcs/cublaslt.h ++++ b/paddle/phi/kernels/funcs/cublaslt.h +@@ -42,19 +42,11 @@ class CublasLtHelper { + CublasLtHelper(int m, int k, int n, cublasLtHandle_t handle) + : handle_(handle), alpha_(1), beta_(0), m_(m), k_(k), n_(n) { + cublasStatus_t status; +-#if CUBLAS_VER_MAJOR < 11 +- cudaDataType_t cudaComputeType = CUDA_R_32I; +-#else + cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I; +-#endif - #include "paddle/phi/core/dense_tensor.h" - #include "paddle/phi/core/tensor_utils.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/math_function.h" + // matmul desc +-#if CUBLAS_VER_MAJOR < 11 +- status = dyl::cublasLtMatmulDescCreate(&matmul_desc_, cudaComputeType); +-#else + status = dyl::cublasLtMatmulDescCreate( + &matmul_desc_, cudaComputeType, CUDA_R_32I); +-#endif + PADDLE_ENFORCE_EQ( + status, 
diff --git a/paddle/phi/kernels/funcs/embedding_grad.h b/paddle/phi/kernels/funcs/embedding_grad.h index 461e6e2474..48a64ae9ce 100644 --- a/paddle/phi/kernels/funcs/embedding_grad.h @@ -453,38 +508,6 @@ index 461e6e2474..48a64ae9ce 100644 #endif dim3 threads(kWarpSize, kBlockDimY); dim3 grids(static_cast((D + kWarpSize - 1) / kWarpSize)); -diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu -index cb35feee32..64f5bd24ac 100644 ---- a/paddle/phi/kernels/funcs/fc_functor.cu -+++ b/paddle/phi/kernels/funcs/fc_functor.cu -@@ -16,12 +16,12 @@ limitations under the License. */ - - #include "paddle/phi/backends/all_context.h" - #include "paddle/phi/kernels/funcs/aligned_vector.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/fc_functor.h" - - #include "paddle/phi/backends/gpu/gpu_launch_config.h" - #include "paddle/phi/core/dense_tensor.h" --#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" -+// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" - #include "paddle/phi/kernels/funcs/quant_dequant.h" - #include "paddle/phi/kernels/matmul_kernel.h" - -diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu -index 88663ec880..98b93072a3 100644 ---- a/paddle/phi/kernels/funcs/gru_compute.cu -+++ b/paddle/phi/kernels/funcs/gru_compute.cu -@@ -12,7 +12,7 @@ limitations under the License. */ - #include "paddle/phi/kernels/funcs/gru_compute.h" - - #include "paddle/phi/backends/gpu/gpu_context.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" - #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 4eae698648..5c047723ea 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -503,19 +526,6 @@ index 4eae698648..5c047723ea 100644 #endif return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; } -diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h -index 15e1a4a3c3..e4780538d7 100644 ---- a/paddle/phi/kernels/funcs/math/context_project.h -+++ b/paddle/phi/kernels/funcs/math/context_project.h -@@ -18,7 +18,7 @@ - #include - - #include "paddle/phi/core/tensor_utils.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/im2col.h" - - namespace phi { diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h index e5361b836e..5ad238df08 100644 --- a/paddle/phi/kernels/funcs/math_cuda_utils.h @@ -559,51 +569,20 @@ index e5361b836e..5ad238df08 100644 return val; } -diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu -index e101224970..a52eb6096f 100644 ---- a/paddle/phi/kernels/funcs/matrix_inverse.cu -+++ b/paddle/phi/kernels/funcs/matrix_inverse.cu -@@ -15,11 +15,13 @@ limitations under the License. 
*/ - #include "paddle/phi/kernels/funcs/matrix_inverse.h" - - #include "paddle/phi/common/memory_utils.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - +diff --git a/paddle/phi/kernels/funcs/quant_dequant.h b/paddle/phi/kernels/funcs/quant_dequant.h +index 8f0736f64e..f11c29a6ef 100644 +--- a/paddle/phi/kernels/funcs/quant_dequant.h ++++ b/paddle/phi/kernels/funcs/quant_dequant.h +@@ -19,9 +19,7 @@ limitations under the License. */ + #include "paddle/phi/backends/gpu/gpu_launch_config.h" + #include "paddle/phi/common/transform.h" + #include "paddle/phi/kernels/funcs/aligned_vector.h" +-#ifndef PADDLE_WITH_CUSTOM_DEVICE + #include "paddle/phi/kernels/funcs/blas/blas.h" +-#endif namespace phi { - namespace funcs { - -+ -+ - template - void MatrixInverseFunctor::operator()(const Context& dev_ctx, - const DenseTensor& a, -diff --git a/paddle/phi/kernels/funcs/matrix_solve.cu b/paddle/phi/kernels/funcs/matrix_solve.cu -index 558d363b39..05da04b517 100644 ---- a/paddle/phi/kernels/funcs/matrix_solve.cu -+++ b/paddle/phi/kernels/funcs/matrix_solve.cu -@@ -16,7 +16,7 @@ limitations under the License. */ - #include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" - #include "paddle/phi/common/memory_utils.h" - #include "paddle/phi/core/tensor_utils.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/phi/kernels/funcs/scatter.cu.h" - -diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -index 047f52bd91..a05b34d3ba 100644 ---- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -+++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -@@ -27,7 +27,7 @@ namespace cub = hipcub; - - #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - namespace phi { + using backends::gpu::GpuLaunchConfig; diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..108edda7ca 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -873,31 +852,17 @@ index e30d440ff3..108edda7ca 100644 } // namespace funcs } // namespace phi +// -diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -index 32db61532f..0220316bc3 100644 ---- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -@@ -15,7 +15,7 @@ - #pragma once - - #if defined(PADDLE_WITH_CUDA) --#include "paddle/phi/backends/dynload/cublasLt.h" -+// #include "paddle/phi/backends/dynload/cublasLt.h" - #endif - - #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h -index 9d4bb18d55..ea42cc10a9 100644 +index 9d4bb18d55..80405c2b78 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h -@@ -638,9 +638,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( +@@ -638,9 +638,6 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( RandVec(&state, rand); #pragma unroll for (int jt = 0; jt < VecSize; jt++) { 
-#ifndef PADDLE_WITH_HIP -#pragma unroll -#endif -+// #pragma unroll mask_vec[it][jt] = static_cast(rand[jt] >= dropout_prob); } } @@ -942,19 +907,6 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. -diff --git a/paddle/phi/kernels/gpu/dot_kernel.cu b/paddle/phi/kernels/gpu/dot_kernel.cu -index af27ac89ab..ee0edc6b8e 100644 ---- a/paddle/phi/kernels/gpu/dot_kernel.cu -+++ b/paddle/phi/kernels/gpu/dot_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/kernels/dot_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" - #include "paddle/phi/core/kernel_registry.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - - #include "paddle/phi/kernels/full_kernel.h" diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -1019,84 +971,6 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" -diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -index 9bc5326c90..79b57a8203 100644 ---- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -@@ -21,7 +21,7 @@ limitations under the License. */ - #include "paddle/phi/common/amp_type_traits.h" - #include "paddle/phi/kernels/addmm_grad_kernel.h" - #include "paddle/phi/kernels/full_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - #include "paddle/phi/kernels/funcs/for_range.h" -diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -index cf80666b4e..ca76e055fb 100644 ---- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -@@ -19,7 +19,7 @@ limitations under the License. */ - - #include "paddle/phi/common/amp_type_traits.h" - #include "paddle/phi/kernels/baddbmm_grad_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - #include "paddle/phi/kernels/funcs/for_range.h" -diff --git a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h -index 2789cb59a2..b91b076f7f 100644 ---- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h -+++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h -@@ -20,7 +20,7 @@ limitations under the License. 
*/ - - #include "paddle/phi/common/amp_type_traits.h" - #include "paddle/phi/kernels/baddbmm_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - -diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h -index 9a21c23666..86413d1577 100644 ---- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h -@@ -19,7 +19,7 @@ - #include "paddle/phi/kernels/conv_transpose_grad_kernel.h" - #include "paddle/phi/kernels/cpu/conv_util.h" - #include "paddle/phi/kernels/full_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" - #include "paddle/phi/kernels/funcs/im2col.h" - #include "paddle/phi/kernels/funcs/slice.h" -diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h -index 4459a931da..837c8682b8 100644 ---- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h -@@ -18,7 +18,7 @@ - #include "paddle/phi/core/dense_tensor.h" - #include "paddle/phi/kernels/empty_kernel.h" - #include "paddle/phi/kernels/full_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - - namespace phi { -diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h -index ad9e9197dd..5478d9817d 100644 ---- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h -+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h -@@ -18,7 +18,7 @@ - #include "paddle/phi/core/dense_tensor.h" - #include "paddle/phi/kernels/empty_kernel.h" - #include "paddle/phi/kernels/full_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - #include "paddle/phi/kernels/transpose_kernel.h" - #include "paddle/utils/optional.h" diff --git a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -1112,80 +986,3 @@ index e6b3960f6d..564125f1f6 100644 if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); -diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h -index 410fb3c560..009ce03440 100644 ---- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h -@@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - - template - HOSTDEVICE T digamma(T x) { -- static T pi = T{3.14159265358979323846}; -+ const static T pi = T{3.14159265358979323846}; - - if (x == T{0.0}) { - T inf = std::numeric_limits::infinity(); -diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -index 5ebbc8d2db..c7b6c338e2 100644 ---- a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -+++ b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -@@ -15,8 +15,9 @@ limitations under the License. 
*/ - #include - #include - #include "paddle/phi/common/datatype_traits.h" --#include "paddle/phi/kernels/funcs/cublaslt.h" --#include "paddle/phi/kernels/funcs/quant_dequant.h" -+#include "kernels/funcs/blas/cublaslt.h" -+#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_kernel/metax_context.h" - - #pragma once - -@@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - - { - auto helper = -- std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); -+ std::make_unique(m, k, n, GetBlasLtHandle()); - helper->GEMM(quant_input.data(), - weight->data(), - int_out.data(), -diff --git a/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h -index 1f319c4ae3..9186eb6906 100644 ---- a/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h -@@ -15,7 +15,7 @@ limitations under the License. */ - #pragma once - - #include "paddle/phi/core/dense_tensor.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/matrix_inverse.h" - - namespace phi { -diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h -index 6f03f76eeb..5fe2c3e7dc 100644 ---- a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h -+++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h -@@ -15,7 +15,7 @@ limitations under the License. */ - #pragma once - - #include "paddle/phi/core/dense_tensor.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/for_range.h" - #include "paddle/phi/kernels/funcs/matrix_inverse.h" - -diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h -index 4099d8b506..baef2cd643 100644 ---- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h -+++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h -@@ -14,7 +14,7 @@ - - #pragma once - --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/math_function.h" - diff --git a/backends/metax_gpu/runtime/runtime.cc b/backends/metax_gpu/runtime/runtime.cc index 36fbd88c2ea..edbe937e7ba 100644 --- a/backends/metax_gpu/runtime/runtime.cc +++ b/backends/metax_gpu/runtime/runtime.cc @@ -36,12 +36,12 @@ #include #include "glog/logging.h" -#include "kernels/funcs/blas/cublasLt.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h" #include "paddle/fluid/platform/profiler/cupti_data_process.h" #include "paddle/phi/api/profiler/trace_event_collector.h" #include "paddle/phi/backends/device_base.h" #include "paddle/phi/backends/device_ext.h" +#include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cupti.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" From f3b6cc45ed5726520e25fc3d65a75ad34168ac40 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:06:44 +0800 Subject: [PATCH 076/121] fix activation_grad kernel (#118) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register 
some kernels & update CMakeLists
* [Metax] fix metax unittest fail
* [Metax] add group_norm & label_smooth kernel and update matmul kernel
* [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register
* add test
* add test
* [test] change the logic of workspace_host in cholesky_kernel_register: alloc(cpuplace, size) passes the test, alloc(cpuplace, size, stream) crashes
* [Metax] fix compile fail
* Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824.
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h'
* [Metax] fix bug and add qr lstsq logsoftmax
* [Metax] conv2d_grad use gpudnn
* [Metax] fix bug and add qr lstsq logsoftmax
* [Metax] change_patch
* [Metax] update unit test CMakeLists.txt
* [Metax] update unit test CMakeLists.txt
* [feature] add unique_consecutive kernel
* [metax] add some kernel
* [metax] add some kernel
* [Metax] register baddbmm kernel & update blas api
* [Metax] register baddbmm kernel & update blas api
* [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined
* [feature] add unique_consecutive kernel.cu
* [fix] fix some test cases due to missing op register
* [fix] fix some failing tests
* [metax] fix lu eigvalsh squeeze rnn kernel
* [metax] fix lu eigvalsh squeeze rnn kernel
* add and fix some kernels
* [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined
* [Metax] fix conflict
* [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure
* [Metax] update repeat_interleave kernel & ignore max op test
* [metax] fix lu eigvalsh squeeze rnn kernel
* [metax] change patch, fix copy
* [metax] change patch, fix copy
* [Metax] update metax_gpu unit test
* [Metax] fix test CMakeLists.txt
* fix some tests
* add one test
* fix one kernel
---------
Co-authored-by: sw <1640472053@qq.com>
Co-authored-by: duqimeng <77875733+duqimeng@users.noreply.github.com>
Co-authored-by: Mingkun.Zhang <2496808993@qq.com>
Co-authored-by: metax666
Co-authored-by: jiaxinWang-metax <189149612@qq.com>
Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Co-authored-by: chezhang <1376507468@qq.com>
Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com>
---
 .../activation_grad_kernel_register.cu | 166 ++++++++++--------
 1 file changed, 91 insertions(+), 75 deletions(-)
diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu
index 6c46ef10c0f..d49e74dea73 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu
@@ -15,8 +15,6 @@ limitations under the License.
*/ #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/activation_grad_kernel.h" #include "paddle/phi/kernels/full_kernel.h" @@ -119,6 +117,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } + #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX( \ name, functor_class, attr1, attr2) \ template \ @@ -135,6 +134,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } + #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -161,6 +161,21 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + double attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ name, functor_class, attr1, attr2) \ template \ @@ -240,9 +255,9 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, CudaCELUGradFunctor, alpha); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(LogitCUDA, - CudaLogitGradFunctor, - eps); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(LogitCUDA, + CudaLogitGradFunctor, + eps); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, CudaHardTanhGradFunctor, @@ -266,6 +281,7 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(ThresholdedRelu, CudaThresholdedReluGradFunctor, threshold, value); + template void SiluGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -390,14 +406,14 @@ PD_CUSTOM_KERNEL_REGISTER(relu_grad, phi::ReluGradKernel, float, double, - phi::dtype::float16) {} + phi::float16) {} PD_CUSTOM_KERNEL_REGISTER(relu_double_grad, metax_gpu, ALL_LAYOUT, phi::ReluDoubleGradKernel, float, double, - phi::dtype::float16) {} + phi::float16) {} #else PD_CUSTOM_KERNEL_REGISTER(relu_grad, metax_gpu, @@ -405,16 +421,16 @@ PD_CUSTOM_KERNEL_REGISTER(relu_grad, phi::ReluGradKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(relu_double_grad, metax_gpu, ALL_LAYOUT, phi::ReluDoubleGradKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} #endif #define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ @@ -424,8 +440,8 @@ PD_CUSTOM_KERNEL_REGISTER(relu_double_grad, phi::func, \ float, \ double, \ - phi::dtype::float16, \ - phi::dtype::bfloat16) {} + phi::float16, \ + phi::bfloat16) {} #define PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(name, func) \ PD_CUSTOM_KERNEL_REGISTER(name, \ @@ -434,10 +450,10 @@ PD_CUSTOM_KERNEL_REGISTER(relu_double_grad, phi::func, \ float, \ double, \ - phi::dtype::float16, \ - phi::dtype::bfloat16, \ - phi::dtype::complex, \ - phi::dtype::complex) {} + phi::float16, \ + phi::bfloat16, \ + phi::complex64, \ + 
phi::complex128) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sin_grad, SinGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(cos_grad, CosGradKernel) @@ -483,10 +499,10 @@ PD_CUSTOM_KERNEL_REGISTER(exp_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(softshrink_grad, SoftShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) @@ -502,10 +518,10 @@ PD_CUSTOM_KERNEL_REGISTER(expm1_grad, phi::Expm1GradKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(square_grad, metax_gpu, @@ -515,10 +531,10 @@ PD_CUSTOM_KERNEL_REGISTER(square_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(square_double_grad, metax_gpu, ALL_LAYOUT, @@ -527,10 +543,10 @@ PD_CUSTOM_KERNEL_REGISTER(square_double_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(sin_double_grad, metax_gpu, @@ -540,10 +556,10 @@ PD_CUSTOM_KERNEL_REGISTER(sin_double_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(sin_triple_grad, metax_gpu, @@ -553,10 +569,10 @@ PD_CUSTOM_KERNEL_REGISTER(sin_triple_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(cos_double_grad, metax_gpu, @@ -566,10 +582,10 @@ PD_CUSTOM_KERNEL_REGISTER(cos_double_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(cos_triple_grad, metax_gpu, @@ -579,10 +595,10 @@ PD_CUSTOM_KERNEL_REGISTER(cos_triple_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softsign_grad, SoftsignGradKernel) @@ -604,10 +620,10 @@ PD_CUSTOM_KERNEL_REGISTER(log_double_grad, phi::LogDoubleGradKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(hardswish_grad, HardSwishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel) @@ -622,8 +638,8 @@ PD_CUSTOM_KERNEL_REGISTER(rint_grad, int64_t, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(round_grad, metax_gpu, ALL_LAYOUT, @@ -632,10 +648,10 @@ PD_CUSTOM_KERNEL_REGISTER(round_grad, int64_t, float, double, - phi::dtype::float16, - 
phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(pow_grad, metax_gpu, ALL_LAYOUT, @@ -644,10 +660,10 @@ PD_CUSTOM_KERNEL_REGISTER(pow_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(pow_double_grad, metax_gpu, ALL_LAYOUT, @@ -656,10 +672,10 @@ PD_CUSTOM_KERNEL_REGISTER(pow_double_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(pow_triple_grad, metax_gpu, ALL_LAYOUT, @@ -668,10 +684,10 @@ PD_CUSTOM_KERNEL_REGISTER(pow_triple_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(ceil_grad, metax_gpu, ALL_LAYOUT, @@ -683,8 +699,8 @@ PD_CUSTOM_KERNEL_REGISTER(ceil_grad, int16_t, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(floor_grad, metax_gpu, ALL_LAYOUT, @@ -696,5 +712,5 @@ PD_CUSTOM_KERNEL_REGISTER(floor_grad, int16_t, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} From c2bb7099311feb00cfc03050bf02565e89461aa9 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 21 Oct 2025 15:07:06 +0800 Subject: [PATCH 077/121] updata flag_and_fix_activation * updata flag_and_fix_activation * updataignore --------- --- backends/metax_gpu/common/flags_declare.cc | 21 +++ .../activation_grad_kernel_register.cu | 21 ++- .../activation_kernel_register.cu | 133 ++++++++++-------- .../kernels/metax_kernel/mmha_util.cu.h | 10 +- backends/metax_gpu/tests/ignore.txt | 6 +- 5 files changed, 119 insertions(+), 72 deletions(-) diff --git a/backends/metax_gpu/common/flags_declare.cc b/backends/metax_gpu/common/flags_declare.cc index 6b497cf9fdf..fb656878033 100644 --- a/backends/metax_gpu/common/flags_declare.cc +++ b/backends/metax_gpu/common/flags_declare.cc @@ -37,6 +37,27 @@ */ static constexpr int kDefaultConvWorkspaceSizeLimitMB = 512; +/** + * CUDA related FLAG + * Name: FLAGS_cublaslt_exhaustive_search_times + * Since Version: 2.3.0 + * Value Range: int64_t, default=0 + * Example: + * Note: Represents times of exhaustive search to evaluate performance of + * cuBlasLt matmul algorithm (with/without epilogue). Set this flag + * with value > 0 to enable exhaustive search. Default is 0, means + * getting algorithms via heuristic search. There are two search methods + * in cuBlasLt, heuristic search and exhaustive search. Exhaustive search + * attempts all cuBlasLt algorithms to select the fastest, which is very + * time-consuming, and the selected algorithm will be cached for a given + * layer specification Once you change the layer specifications + * (such as M, N and K), it will re-search again. 
+ */ +PHI_DEFINE_EXPORTED_int64( + cublaslt_exhaustive_search_times, + 0, + "The times of exhaustive search for cuBlasLt matmul with/without " + " epilogue algorithms, default is 0, means disabling exhaustive search."); PHI_DEFINE_EXPORTED_bool( cudnn_exhaustive_search, diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu index d49e74dea73..f5ee4ec25f8 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu @@ -101,6 +101,21 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + double attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ name, functor_class, attr1, attr2) \ template \ @@ -239,9 +254,9 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, CudaLog10GradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, CudaLog1pGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, CudaSwishGradFunctor); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, - CudaLeakyReluGradFunctor, - alpha); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX(LeakyRelu, + CudaLeakyReluGradFunctor, + alpha); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, CudaSoftShrinkGradFunctor, lambda); diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu index 363932cfc28..d91e4afd25e 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu @@ -14,8 +14,6 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/full_kernel.h" @@ -75,6 +73,19 @@ void ActivationGPUImpl(const Context& dev_ctx, dev_ctx, x, out, functor); \ } +#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + double attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + #define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS( \ name, functor_class, attr1, attr2) \ template \ @@ -90,6 +101,7 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl>( \ dev_ctx, x, out, functor); \ } + #define DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS( \ name, functor_class, attr1, attr2) \ template \ @@ -105,6 +117,7 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl>( \ dev_ctx, x, out, functor); \ } + DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) @@ -138,8 +151,10 @@ DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Log1p, CudaLog1pFunctor) DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Exp, CudaExpFunctor) DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Expm1, CudaExpm1Functor) -DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) -DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LogitCUDA, CudaLogitFunctor, eps) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(LeakyRelu, + CudaLeakyReluFunctor, + alpha) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(LogitCUDA, CudaLogitFunctor, eps) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, CudaHardShrinkFunctor, threshold) @@ -286,13 +301,9 @@ void PowKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_HIP -PD_CUSTOM_KERNEL_REGISTER(relu, - metax_gpu, - ALL_LAYOUT, - phi::ReluKernel, - float, - double, - phi::dtype::float16) {} +PD_CUSTOM_KERNEL_REGISTER( + relu, metax_gpu, ALL_LAYOUT, phi::ReluKernel, float, double, phi::float16) { +} #else PD_CUSTOM_KERNEL_REGISTER(relu, metax_gpu, @@ -300,8 +311,8 @@ PD_CUSTOM_KERNEL_REGISTER(relu, phi::ReluKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} #endif #define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ @@ -311,8 +322,8 @@ PD_CUSTOM_KERNEL_REGISTER(relu, phi::func, \ float, \ double, \ - phi::dtype::float16, \ - phi::dtype::bfloat16) {} + phi::float16, \ + phi::bfloat16) {} #define PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(name, func) \ PD_CUSTOM_KERNEL_REGISTER(name, \ @@ -321,10 +332,10 @@ PD_CUSTOM_KERNEL_REGISTER(relu, phi::func, \ float, \ double, \ - phi::dtype::float16, \ - phi::dtype::bfloat16, \ - phi::dtype::complex, \ - phi::dtype::complex) {} + phi::float16, \ + phi::bfloat16, \ + phi::complex64, \ + phi::complex128) {} PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sin, SinKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(cos, CosKernel) @@ -357,10 +368,10 @@ PD_CUSTOM_KERNEL_REGISTER(exp, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} 
PD_CUSTOM_KERNEL_REGISTER(expm1, metax_gpu, ALL_LAYOUT, @@ -369,10 +380,10 @@ PD_CUSTOM_KERNEL_REGISTER(expm1, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(square, metax_gpu, ALL_LAYOUT, @@ -381,10 +392,10 @@ PD_CUSTOM_KERNEL_REGISTER(square, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(softshrink, SoftShrinkKernel) @@ -409,8 +420,8 @@ PD_CUSTOM_KERNEL_REGISTER(rint, int64_t, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(round, metax_gpu, ALL_LAYOUT, @@ -419,10 +430,10 @@ PD_CUSTOM_KERNEL_REGISTER(round, int64_t, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(log, metax_gpu, ALL_LAYOUT, @@ -431,10 +442,10 @@ PD_CUSTOM_KERNEL_REGISTER(log, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(log2, metax_gpu, ALL_LAYOUT, @@ -443,10 +454,10 @@ PD_CUSTOM_KERNEL_REGISTER(log2, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(log10, metax_gpu, ALL_LAYOUT, @@ -455,10 +466,10 @@ PD_CUSTOM_KERNEL_REGISTER(log10, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(log1p, metax_gpu, ALL_LAYOUT, @@ -467,10 +478,10 @@ PD_CUSTOM_KERNEL_REGISTER(log1p, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(pow, metax_gpu, ALL_LAYOUT, @@ -479,10 +490,10 @@ PD_CUSTOM_KERNEL_REGISTER(pow, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(ceil, metax_gpu, ALL_LAYOUT, @@ -494,8 +505,8 @@ PD_CUSTOM_KERNEL_REGISTER(ceil, int16_t, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(floor, metax_gpu, ALL_LAYOUT, @@ -507,5 +518,5 @@ PD_CUSTOM_KERNEL_REGISTER(floor, int16_t, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/mmha_util.cu.h b/backends/metax_gpu/kernels/metax_kernel/mmha_util.cu.h index aa352e600b5..187b0fc534a 100644 --- a/backends/metax_gpu/kernels/metax_kernel/mmha_util.cu.h +++ b/backends/metax_gpu/kernels/metax_kernel/mmha_util.cu.h @@ -49,10 +49,10 @@ #pragma once -#if defined(__CUDACC__) && CUDA_VERSION >= 11000 +// #if defined(__CUDACC__) && CUDA_VERSION >= 11000 
#define ENABLE_BF16 #include -#endif +// #endif #ifdef PADDLE_WITH_HIP #include @@ -72,8 +72,8 @@ namespace cub = hipcub; #endif #include "paddle/phi/common/datatype_traits.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/math_function.h" - #ifdef PADDLE_WITH_HIP /// integral_constant template @@ -130,7 +130,7 @@ struct Float4_ { float2 y; }; -#if defined(ENABLE_BF16) || defined(PADDLE_WITH_HIP) +// #if defined(ENABLE_BF16) || defined(PADDLE_WITH_HIP) struct bf16_4_t { __nv_bfloat162 x; __nv_bfloat162 y; @@ -142,7 +142,7 @@ struct bf16_8_t { __nv_bfloat162 z; __nv_bfloat162 w; }; -#endif +// #endif //----------------------------------- template diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index be0357e5319..2b0fae559e6 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -24,9 +24,9 @@ test_conv3d_layer test_conv3d_transpose_part2_op test_fused_conv2d_add_act_op test_swiglu_metax -test_set_value_op -test_pad_op test_squared_l2_norm_op -test_concat_op test_dygraph_spectral_norm test_bincount_op +test_adamw_op +test_einsum_op +test_complex_matmul From 8f161637ce03c6501e2aae5eba993b2ad1ef8778 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:11:49 +0800 Subject: [PATCH 078/121] updata_patch (#120) * updata_patch --------- --- backends/metax_gpu/patch/paddle.patch | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 70553934dfb..4c844e5cc82 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -50,7 +50,7 @@ index 62beb53cfe..0b0ac09fc0 100644 } \ }; \ diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h -index 0527e743e7..247a844f18 100644 +index 8b2e08c777..ca926df151 100644 --- a/paddle/phi/backends/dynload/cublasLt.h +++ b/paddle/phi/backends/dynload/cublasLt.h @@ -46,12 +46,14 @@ extern void *cublasLt_dso_handle; @@ -68,7 +68,7 @@ index 0527e743e7..247a844f18 100644 extern DynLoad__##__name __name - // APIs available after CUDA 11.1 - #if CUDA_VERSION >= 11010 + #if CUDA_VERSION >= 11010 || defined(PADDLE_WITH_CUSTOM_DEVICE) #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ @@ -79,8 +81,8 @@ extern void *cublasLt_dso_handle; __macro(cublasLtMatmulAlgoConfigGetAttribute); \ @@ -440,6 +440,7 @@ index 024a7de73e..66b373d698 100644 } \ } while (0) #elif defined(__HIPCC__) + diff --git a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h index e63b3d2f6e..95d7e6f204 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h @@ -470,7 +471,7 @@ index e63b3d2f6e..95d7e6f204 100644 for (const auto& [seed, algo] : algo_caches_) { outfile << seed << " "; diff --git a/paddle/phi/kernels/funcs/cublaslt.h b/paddle/phi/kernels/funcs/cublaslt.h -index e7e1dd2370..583c7d6474 100644 +index fbbf57c25a..f690db59e9 100644 --- a/paddle/phi/kernels/funcs/cublaslt.h +++ b/paddle/phi/kernels/funcs/cublaslt.h @@ -42,19 +42,11 @@ class CublasLtHelper { @@ -569,20 +570,6 @@ index e5361b836e..5ad238df08 100644 return val; } -diff --git a/paddle/phi/kernels/funcs/quant_dequant.h b/paddle/phi/kernels/funcs/quant_dequant.h -index 8f0736f64e..f11c29a6ef 100644 ---- a/paddle/phi/kernels/funcs/quant_dequant.h -+++ b/paddle/phi/kernels/funcs/quant_dequant.h -@@ -19,9 +19,7 @@ 
limitations under the License. */ - #include "paddle/phi/backends/gpu/gpu_launch_config.h" - #include "paddle/phi/common/transform.h" - #include "paddle/phi/kernels/funcs/aligned_vector.h" --#ifndef PADDLE_WITH_CUSTOM_DEVICE - #include "paddle/phi/kernels/funcs/blas/blas.h" --#endif - namespace phi { - - using backends::gpu::GpuLaunchConfig; diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..108edda7ca 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -893,7 +880,7 @@ index b2d15a59f8..f64582e85a 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h -index f0cca0f701..02ea957240 100644 +index 2edac5eba5..4f265e3db7 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -29,8 +29,8 @@ namespace cub = hipcub; @@ -959,7 +946,7 @@ index 63c35dd4ee..15da9aea45 100644 namespace phi { diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu -index 1bdbe1564c..f753b54bc6 100644 +index c7f27b2924..4cf6204ac7 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu +++ b/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -21,7 +21,7 @@ From b272dbe557db51ffe0def0b38e5d697c721b3995 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 09:53:37 +0800 Subject: [PATCH 079/121] Update Paddle submodule to latest develop (#121) Co-authored-by: tianshuo78520a --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 5dbecdcb0e4..1f00e2178ad 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 5dbecdcb0e4ddd3488927f49082dfb66c794f9e7 +Subproject commit 1f00e2178ad3249ecd8bb83e59bc6ac1ebcac413 From dc38f3d79c539796767a7454ca1fcd76486441db Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 22 Oct 2025 10:23:24 +0800 Subject: [PATCH 080/121] [metax] modify kernels (#122) * modify kernels --- backends/metax_gpu/patch/paddle.patch | 158 +++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 4c844e5cc82..6578029129e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -440,7 +440,163 @@ index 024a7de73e..66b373d698 100644 } \ } while (0) #elif defined(__HIPCC__) - +diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +index ae7b67de6d..fbe9f67737 100644 +--- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h ++++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +@@ -368,7 +368,7 @@ struct CUBlas { + cudaDataType_t Ctype, + int ldc, + int batchCount, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + #if CUDA_VERSION >= 9000 +@@ -476,7 +476,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + #if CUDA_VERSION >= 9000 +@@ -532,7 +532,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int64_t ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 12030 && defined(__linux__) + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool 
use_tensor_op_math = dev_ctx->tensor_core_available(); +@@ -759,7 +759,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + #if CUDA_VERSION >= 9000 +@@ -815,7 +815,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int64_t ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 12030 && defined(__linux__) + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = dev_ctx->tensor_core_available(); +@@ -1154,7 +1154,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + #if CUDA_VERSION >= 9000 +@@ -1210,7 +1210,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int64_t ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 12030 && defined(__linux__) + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = dev_ctx->tensor_core_available(); +@@ -1484,7 +1484,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16F, + N, +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + #else + PADDLE_THROW(common::errors::Unimplemented( + "GEMM_EX_64 is not supported on cuda < 12.3")); +@@ -1508,7 +1508,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16F, + static_cast(N), +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + } + #else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm +@@ -1694,7 +1694,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16F, + N, +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + #else + PADDLE_THROW(common::errors::Unimplemented( + "GEMM_EX_64 is not supported on cuda < 12.3")); +@@ -1719,7 +1719,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16F, + static_cast(N), +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + #else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + dev_ctx_.CublasCall([&](cublasHandle_t handle) { +@@ -1831,7 +1831,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16BF, + static_cast(N), +- CUDA_R_32F, ++ CUBLAS_COMPUTE_32F, + algo)); + }); + } +@@ -1932,7 +1932,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16BF, + static_cast(N), +- CUDA_R_32F, ++ CUBLAS_COMPUTE_32F, + algo)); + }); + } +@@ -2026,7 +2026,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_C_32F, + static_cast(N), +- CUDA_C_32F); ++ CUBLAS_COMPUTE_32F); + + #else + dev_ctx_.CublasCall([&](cublasHandle_t handle) { +@@ -2111,7 +2111,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_C_64F, + N, +- CUDA_C_64F); ++ CUBLAS_COMPUTE_64F); + #else + PADDLE_THROW(common::errors::Unimplemented( + "GEMM_EX_64 is not supported on cuda < 12.3")); +@@ -2136,7 +2136,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_C_64F, + static_cast(N), +- CUDA_C_64F); ++ CUBLAS_COMPUTE_64F); + #else // CUDA_VERSION >= 8000 + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + dev_ctx_.CublasCall([&](cublasHandle_t handle) { +@@ -3129,7 +3129,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CUDA_R_16F, + ldc, + batchCount, +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + } + + template <> diff --git a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h 
b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h index e63b3d2f6e..95d7e6f204 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h From 5fe7108e40ac7179ad8cce5967f5f8fe9d15e7f0 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 23 Oct 2025 10:01:26 +0800 Subject: [PATCH 081/121] [Metax] fix weight_quant & weight_only_linear bug (#125) * [Metax] fix weight_quant & weight_only_linear bug --- .../kernels/metax_kernel/weight_only_linear_kernel.cu | 4 ++-- .../kernels/metax_kernel/weight_quantize_kernel_register.cu | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu index d2f39ccf751..65cf99d3065 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu @@ -166,7 +166,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, mctlassGemmScaleOp_w4a16_nobias::epilogueParams( reinterpret_cast(bias_data)), mctlassGemmScaleOp_w4a16_nobias::quantscaleParams( - 1, + 2, group_size, reinterpret_cast(weight_scale_data)), reinterpret_cast(x_data), @@ -191,7 +191,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, mctlassGemmScaleOp_w4a16_bias::epilogueParams( reinterpret_cast(bias_data)), mctlassGemmScaleOp_w4a16_bias::quantscaleParams( - 1, + 2, group_size, reinterpret_cast(weight_scale_data)), reinterpret_cast(x_data), diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 44ac7f2fddc..46045f55c27 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -120,7 +120,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, weight_shape, arch, algo); - out->Resize({m, n}); + out->Resize({m, n / 2}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); From 14a340c28b778cb9926740fb7bd39879af31d449 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Fri, 24 Oct 2025 10:27:19 +0800 Subject: [PATCH 082/121] fix and add some kernels (#126) * fix and add some kernels --- ...used_gemm_epilogue_grad_kernel_register.cu | 26 +++++++++++++++++++ .../fused_gemm_epilogue_kernel_register.cu | 26 +++++++++++++++++++ ...d_linear_param_grad_add_kernel_register.cu | 24 +++++++++++++++++ .../cuda_kernels/pad_grad_kernel_register.cu | 8 +++--- .../softmax_kernel_grad_register.cu | 1 + 5 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/fused_linear_param_grad_add_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_grad_kernel_register.cu new file mode 100644 index 00000000000..2e8d33b964c --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_grad_kernel_register.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/fused_gemm_epilogue.h" +#include "paddle/phi/kernels/fusion/gpu/fused_gemm_epilogue_grad_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_gemm_epilogue_grad, + metax_gpu, + ALL_LAYOUT, + phi::fusion::FusedGemmEpilogueGradKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_kernel_register.cu new file mode 100644 index 00000000000..9be5794c54f --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_kernel_register.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/fused_gemm_epilogue.h" +#include "paddle/phi/kernels/fusion/gpu/fused_gemm_epilogue_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_gemm_epilogue, + metax_gpu, + ALL_LAYOUT, + phi::fusion::FusedGemmEpilogueKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_linear_param_grad_add_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_linear_param_grad_add_kernel_register.cu new file mode 100644 index 00000000000..c88f94625b7 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_linear_param_grad_add_kernel_register.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu" //NOLINT +PD_CUSTOM_KERNEL_REGISTER(fused_linear_param_grad_add, + metax_gpu, + ALL_LAYOUT, + phi::fusion::FusedLinearParamGradAdd, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/pad_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/pad_grad_kernel_register.cu index 38b89fce698..f87f589a424 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/pad_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/pad_grad_kernel_register.cu @@ -20,6 +20,8 @@ PD_CUSTOM_KERNEL_REGISTER(pad_grad, ALL_LAYOUT, phi::PadGradKernel, float, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex) {} + double, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu index 9b981029fc0..407180deca8 100644 --- a/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu @@ -45,5 +45,6 @@ PD_REGISTER_PLUGIN_KERNEL(softmax_grad, ALL_LAYOUT, phi::SoftmaxGradGPUDNNKernel, float, + double, phi::dtype::float16, phi::dtype::bfloat16) {} From faac2c969d9b609d3e5443c43ad55e958b6de5b3 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 28 Oct 2025 09:55:12 +0800 Subject: [PATCH 083/121] [Metax] fix 'WeightQuantizeKernel' wint4 branch (#133) * [Metax] fix 'WeightQuantizeKernel' wint4 branch --- .../kernels/metax_kernel/weight_quantize_kernel_register.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 46045f55c27..cb80385a7a0 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -120,7 +120,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, weight_shape, arch, algo); - out->Resize({m, n / 2}); + out->Resize({m / 2, n}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); From 29630cbb408061521a65129fb68bb1c5d3e9814f Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:18:17 +0800 Subject: [PATCH 084/121] [Metax] add quanted weight layout transformation using CPU programming (#135) * [Metax] adjust quanted weight layout transformation --- .../impl/metax_weight_quantize_kernel_impl.h | 150 ++++++++++++++++++ .../weight_quantize_kernel_register.cu | 3 +- 2 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h diff --git a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h new file mode 100644 index 00000000000..3452cceb74e --- /dev/null +++ b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h @@ -0,0 +1,150 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +namespace phi { + +void cpu_2d_tensor_transpose(const DenseTensor& input_data, + DenseTensor* transposed_data) { + const int64_t input_data_rows = input_data.dims()[0]; + const int64_t input_data_cols = input_data.dims()[1]; + + const int8_t* input_data_ptr = input_data.data(); + int8_t* transposed_data_ptr = transposed_data->data(); + + for (int64_t r = 0; r < input_data_rows; r++) { + for (int64_t c = 0; c < input_data_cols; c++) { + *(transposed_data_ptr + r + c * input_data_rows) = + *(input_data_ptr + r * input_data_cols + c); + } + } +} + +void cpu_int4_quanted_weight_raw_unpack(const DenseTensor& packed_data, + DenseTensor* unpacked_data) { + const int64_t packed_data_rows = packed_data.dims()[0]; + const int64_t packed_data_cols = packed_data.dims()[1]; + + const int8_t* packed_data_ptr = packed_data.data(); + int8_t* unpacked_data_ptr = unpacked_data->data(); + + for (int64_t c = 0; c < packed_data_cols; c++) { + for (int64_t r = 0; r < packed_data_rows; r++) { + int8_t val = *(packed_data_ptr + r * packed_data_cols + c); + int8_t low_int4 = val & 0x0f; + int8_t hight_int4 = (val >> 4) & 0x0f; + + *(unpacked_data_ptr + (2 * r) * packed_data_cols + c) = + low_int4 >= 8 ? low_int4 - 16 : low_int4; + *(unpacked_data_ptr + (2 * r + 1) * packed_data_cols + c) = + hight_int4 >= 8 ? hight_int4 - 16 : hight_int4; + } + } +} + +void cpu_int4_quanted_weight_col_pack(const DenseTensor& unpacked_data, + DenseTensor* packed_data) { + const int64_t packed_data_rows = packed_data->dims()[0]; + const int64_t packed_data_cols = packed_data->dims()[1]; + + int8_t* packed_data_ptr = packed_data->data(); + const int8_t* unpacked_data_ptr = unpacked_data.data(); + + for (int64_t r = 0; r < packed_data_rows; r++) { + for (int64_t c = 0; c < packed_data_cols; c++) { + int8_t low_int4 = *(unpacked_data_ptr + 2 * r * packed_data_cols + 2 * c); + int8_t hight_int4 = + *(unpacked_data_ptr + 2 * r * packed_data_cols + 2 * c + 1); + + low_int4 = low_int4 < 0 ? low_int4 + 16 : low_int4; + hight_int4 = hight_int4 < 0 ? 
hight_int4 + 16 : hight_int4; + + *(packed_data_ptr + r * packed_data_cols + c) = + ((hight_int4 << 4) & 0xf0) | (low_int4 & 0x0f); + } + } +} + +void show_2d_cpu_tensor(const DenseTensor& tensor, const int64_t size = 3) { + const int64_t rows = tensor.dims()[0]; + const int64_t cols = tensor.dims()[1]; + printf("\nTensor shape = [%d, %d]\n", rows, cols); + + const int8_t* cpu_ptr = tensor.data(); + + for (int r = 0; r < size; r++) { + for (int c = 0; c < size; c++) { + int8_t val = *(cpu_ptr + r * cols + c); + printf("%d ", val); + } + printf("\n"); + } + printf("\n\n"); +} + +template +void MetaxQuantizedWeightLayoutTrans(const Context& dev_ctx, + const std::string& algo, + const std::vector& shape, + DenseTensor* out) { + const int64_t m = shape[0]; + const int64_t n = shape[1]; + + phi::CPUPlace cpu_place; + + if (algo == "weight_only_int4") { + out->Resize({m / 2, n}); + + DenseTensor out_cpu_tensor; + phi::Copy(dev_ctx, (*out), cpu_place, true, &out_cpu_tensor); + + // raw unpack + DenseTensor raw_unpack_tensor; + raw_unpack_tensor.Resize({out_cpu_tensor.dims()[0] * 2, n}); + raw_unpack_tensor.mutable_data(cpu_place); + cpu_int4_quanted_weight_raw_unpack(out_cpu_tensor, &raw_unpack_tensor); + + // transpose + DenseTensor transposed_tensor; + transposed_tensor.Resize( + {raw_unpack_tensor.dims()[1], raw_unpack_tensor.dims()[0]}); + transposed_tensor.mutable_data(cpu_place); + cpu_2d_tensor_transpose(raw_unpack_tensor, &transposed_tensor); + + // col pack + out_cpu_tensor.Resize( + {transposed_tensor.dims()[0], transposed_tensor.dims()[1] / 2}); + cpu_int4_quanted_weight_col_pack(transposed_tensor, &out_cpu_tensor); + + out_cpu_tensor.Resize({n / 2, m}); + out->Resize({n / 2, m}); + phi::Copy(dev_ctx, out_cpu_tensor, dev_ctx.GetPlace(), true, out); + } else { + PADDLE_FATAL( + "The algo must be in ['weight_only_int4'" + "], but got[%s]", + algo); + } +} + +} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index cb80385a7a0..8d72ed2138e 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#include "../impl/metax_weight_quantize_kernel_impl.h" #include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/datatype_traits.h" @@ -120,7 +121,6 @@ void WeightQuantizeKernel(const Context& dev_ctx, weight_shape, arch, algo); - out->Resize({m / 2, n}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); @@ -141,6 +141,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, // arch, // algo); #endif + MetaxQuantizedWeightLayoutTrans(dev_ctx, algo, weight_shape, out); } else if (algo == "w4a8") { weight_permute_gpu_w4a8(dev_ctx, x.data(), From d85bc263ddfea142407b2fa9d6ee50d657e7693d Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Wed, 29 Oct 2025 16:28:23 +0800 Subject: [PATCH 085/121] [Metax] add quanted weight layout transformation using GPU programming (#136) * [Metax] add quanted weight layout transformation using GPU programming --- .../impl/metax_weight_quantize_kernel_impl.h | 218 ++++++++++++++---- 1 file changed, 175 insertions(+), 43 deletions(-) diff --git a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h index 3452cceb74e..b305ec96a30 100644 --- a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h @@ -16,14 +16,60 @@ #include +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { +void show_2d_cpu_tensor(const DenseTensor& tensor, + const int64_t row_num = 3, + const int64_t col_num = 3) { + const int64_t rows = tensor.dims()[0]; + const int64_t cols = tensor.dims()[1]; + printf("\nTensor shape = [%d, %d]\n", rows, cols); + + const int8_t* cpu_ptr = tensor.data(); + + for (int r = 0; r < row_num; r++) { + for (int c = 0; c < col_num; c++) { + int8_t val = *(cpu_ptr + r * cols + c); + printf("%d ", val); + } + printf("\n"); + } + printf("\n\n"); +} + +void show_2d_gpu_tensor(const CustomContext& dev_ctx, + const DenseTensor& tensor, + const int64_t row_num = 3, + const int64_t col_num = 3) { + phi::CPUPlace cpu_place; + + DenseTensor cpu_tensor; + phi::Copy(dev_ctx, tensor, cpu_place, true, &cpu_tensor); + + const int64_t rows = cpu_tensor.dims()[0]; + const int64_t cols = cpu_tensor.dims()[1]; + printf("\nTensor shape = [%d, %d]\n", rows, cols); + + const int8_t* cpu_ptr = cpu_tensor.data(); + + for (int r = 0; r < row_num; r++) { + for (int c = 0; c < col_num; c++) { + int8_t val = *(cpu_ptr + r * cols + c); + printf("%d ", val); + } + printf("\n"); + } + printf("\n\n"); +} + void cpu_2d_tensor_transpose(const DenseTensor& input_data, DenseTensor* transposed_data) { const int64_t input_data_rows = input_data.dims()[0]; @@ -85,21 +131,132 @@ void cpu_int4_quanted_weight_col_pack(const DenseTensor& unpacked_data, } } -void show_2d_cpu_tensor(const DenseTensor& tensor, const int64_t size = 3) { - const int64_t rows = tensor.dims()[0]; - const int64_t cols = tensor.dims()[1]; - printf("\nTensor shape = [%d, %d]\n", rows, cols); +void cpu_int4_quantized_weight_layout_trans_impl( + const CustomContext& dev_ctx, + const std::vector& shape, + DenseTensor* out) { + const int64_t 
m = shape[0]; + const int64_t n = shape[1]; - const int8_t* cpu_ptr = tensor.data(); + phi::CPUPlace cpu_place; - for (int r = 0; r < size; r++) { - for (int c = 0; c < size; c++) { - int8_t val = *(cpu_ptr + r * cols + c); - printf("%d ", val); - } - printf("\n"); + out->Resize({m / 2, n}); + + DenseTensor out_cpu_tensor; + phi::Copy(dev_ctx, (*out), cpu_place, true, &out_cpu_tensor); + + // raw unpack + DenseTensor raw_unpack_tensor; + raw_unpack_tensor.Resize({out_cpu_tensor.dims()[0] * 2, n}); + raw_unpack_tensor.mutable_data(cpu_place); + cpu_int4_quanted_weight_raw_unpack(out_cpu_tensor, &raw_unpack_tensor); + + // transpose + DenseTensor transposed_tensor; + transposed_tensor.Resize( + {raw_unpack_tensor.dims()[1], raw_unpack_tensor.dims()[0]}); + transposed_tensor.mutable_data(cpu_place); + cpu_2d_tensor_transpose(raw_unpack_tensor, &transposed_tensor); + + // col pack + out_cpu_tensor.Resize( + {transposed_tensor.dims()[0], transposed_tensor.dims()[1] / 2}); + cpu_int4_quanted_weight_col_pack(transposed_tensor, &out_cpu_tensor); + + out_cpu_tensor.Resize({n / 2, m}); + out->Resize({n / 2, m}); + phi::Copy(dev_ctx, out_cpu_tensor, dev_ctx.GetPlace(), true, out); +} + +__global__ void int4_quanted_matrix_raw_unpack_kernel(const int8_t* mat, + int8_t* unpack_mat, + int M, + int N) { + int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + + int i = global_idx / N; + int j = global_idx % N; + + if (global_idx >= M * N) { + return; } - printf("\n\n"); + + int8_t val = mat[global_idx]; + int8_t low = val & 0x0F; + int8_t mask = ((low & 0x80) == 0) & ((low & 0x78) != 0); + low -= 16 * mask; + + int8_t high = (val >> 4) & 0x0F; + mask = ((high & 0x80) == 0) & ((high & 0x78) != 0); + high -= 16 * mask; + + int output_global_idx0 = (2 * i) * N + j; + int output_global_idx1 = (2 * i + 1) * N + j; + + unpack_mat[output_global_idx0] = low; + unpack_mat[output_global_idx1] = high; +} + +__global__ void int4_quanted_matrix_col_pack_kernel(const int8_t* mat, + int8_t* pack_mat, + int M, + int N) { + int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + + int i = global_idx / N; + int j = global_idx % N; + + if (global_idx >= M * N) { + return; + } + + int mat_global_idx0 = i * 2 * N + 2 * j; + int mat_global_idx1 = i * 2 * N + 2 * j + 1; + + int8_t low = mat[mat_global_idx0] & 0x0F; + low = low + ((low >> 3) & 1) * 16; + + int8_t high = mat[mat_global_idx1] & 0x0F; + high = high + ((high >> 3) & 1) * 16; + + pack_mat[global_idx] = ((high << 4) & 0xf0) | (low & 0x0f); +} + +void gpu_int4_quantized_weight_layout_trans_impl( + const CustomContext& dev_ctx, + const std::vector& shape, + DenseTensor* out) { + int64_t total_m = shape[0]; + int64_t total_n = shape[1]; + out->Resize({total_m / 2, total_n}); + + DenseTensor unpack_mat(out->type()); + unpack_mat.Resize({total_m, total_n}); + dev_ctx.template Alloc(&unpack_mat); + + constexpr int kBlockSize = 64; + int64_t kGridSize = (out->numel() + kBlockSize - 1) / kBlockSize; + int4_quanted_matrix_raw_unpack_kernel<<>>( + out->data(), + unpack_mat.data(), + out->dims()[0], + out->dims()[1]); + + DenseTensor transposed_tensor; + transposed_tensor.Resize({unpack_mat.dims()[1], unpack_mat.dims()[0]}); + dev_ctx.template Alloc(&transposed_tensor); + std::vector axis = {1, 0}; + funcs::Transpose trans; + trans(dev_ctx, unpack_mat, &transposed_tensor, axis); + + out->Resize({transposed_tensor.dims()[0], transposed_tensor.dims()[1] / 2}); + int4_quanted_matrix_col_pack_kernel<<>>( + transposed_tensor.data(), + out->data(), + out->dims()[0], + 
out->dims()[1]); + + out->Resize({total_n / 2, total_m}); } template @@ -107,38 +264,13 @@ void MetaxQuantizedWeightLayoutTrans(const Context& dev_ctx, const std::string& algo, const std::vector& shape, DenseTensor* out) { - const int64_t m = shape[0]; - const int64_t n = shape[1]; - - phi::CPUPlace cpu_place; - if (algo == "weight_only_int4") { - out->Resize({m / 2, n}); - - DenseTensor out_cpu_tensor; - phi::Copy(dev_ctx, (*out), cpu_place, true, &out_cpu_tensor); - - // raw unpack - DenseTensor raw_unpack_tensor; - raw_unpack_tensor.Resize({out_cpu_tensor.dims()[0] * 2, n}); - raw_unpack_tensor.mutable_data(cpu_place); - cpu_int4_quanted_weight_raw_unpack(out_cpu_tensor, &raw_unpack_tensor); - - // transpose - DenseTensor transposed_tensor; - transposed_tensor.Resize( - {raw_unpack_tensor.dims()[1], raw_unpack_tensor.dims()[0]}); - transposed_tensor.mutable_data(cpu_place); - cpu_2d_tensor_transpose(raw_unpack_tensor, &transposed_tensor); - - // col pack - out_cpu_tensor.Resize( - {transposed_tensor.dims()[0], transposed_tensor.dims()[1] / 2}); - cpu_int4_quanted_weight_col_pack(transposed_tensor, &out_cpu_tensor); - - out_cpu_tensor.Resize({n / 2, m}); - out->Resize({n / 2, m}); - phi::Copy(dev_ctx, out_cpu_tensor, dev_ctx.GetPlace(), true, out); + if (dev_ctx.GetPlace() == phi::CPUPlace()) { + cpu_int4_quantized_weight_layout_trans_impl(dev_ctx, shape, out); + } else { + gpu_int4_quantized_weight_layout_trans_impl(dev_ctx, shape, out); + } + } else { PADDLE_FATAL( "The algo must be in ['weight_only_int4'" From f62a59a0cca15919f1f66d189f88f2b567344b08 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 30 Oct 2025 12:44:13 +0800 Subject: [PATCH 086/121] [Metax] updata_softmax (#138) * updata_softmax --- backends/metax_gpu/common/flags_declare.cc | 12 ++++ .../cuda_kernels/gammaln_grad_kernel.cu | 28 ++++++++ backends/metax_gpu/kernels/funcs/softmax.cu | 3 +- .../kernels/gpudnn/softmax_kernel_dnn.cu | 70 +++++++++++++++++++ .../metax_kernel/softmax_kernel_register.cu | 4 +- .../metax_kernel/svd_kernel_register.cu | 66 ++++++++--------- 6 files changed, 146 insertions(+), 37 deletions(-) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/gpudnn/softmax_kernel_dnn.cu diff --git a/backends/metax_gpu/common/flags_declare.cc b/backends/metax_gpu/common/flags_declare.cc index fb656878033..0b65d635510 100644 --- a/backends/metax_gpu/common/flags_declare.cc +++ b/backends/metax_gpu/common/flags_declare.cc @@ -101,6 +101,18 @@ PHI_DEFINE_EXPORTED_bool( "faster but it may loss precision in most case. If true, the compute " "type will be set to fp16. Default is false."); +/** + * Torch Compatible related FLAG + * Name: FLAGS_torch_compatible_kernel + * Since Version: 3.2.2 + * Value Range: bool, default=false + * Example: + * Note: Whether use torch compatible version kernel. + */ +PHI_DEFINE_EXPORTED_bool(torch_compatible_kernel, + false, + "Whether use torch compatible version kernel."); + PHI_DEFINE_EXPORTED_string( selected_gpus, "", diff --git a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu new file mode 100644 index 00000000000..850f0d68bac --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gammaln_grad_kernel.h" +#include "paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h" + +PD_CUSTOM_KERNEL_REGISTER(gammaln_grad, + metax_gpu, + ALL_LAYOUT, + phi::GammalnGradKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu index 44bfd02a308..a587f9ed016 100644 --- a/backends/metax_gpu/kernels/funcs/softmax.cu +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -13,13 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "glog/logging.h" #include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/softmax.h" #include "paddle/phi/kernels/funcs/softmax_impl.h" - namespace phi { namespace funcs { @@ -38,6 +38,7 @@ void SoftmaxCUDNNFunctor::operator()( ScopedTensorDescriptor yDesc; std::vector cudnn_tensor_dims = common::vectorize(X->dims()); DataLayout layout = DataLayout::kNCHW; + VLOG(0) << "Enter softmax Kernel22."; if (cudnn_tensor_dims.size() == 5) { layout = DataLayout::kNCDHW; } diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_kernel_dnn.cu b/backends/metax_gpu/kernels/gpudnn/softmax_kernel_dnn.cu new file mode 100644 index 00000000000..b51f92c96a4 --- /dev/null +++ b/backends/metax_gpu/kernels/gpudnn/softmax_kernel_dnn.cu @@ -0,0 +1,70 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "kernels/gpudnn/softmax_gpudnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/softmax_kernel.h" + +namespace phi { + +template +void SoftmaxGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + dev_ctx.template Alloc(out); + if (x.numel() == 0) return; + + const int rank = x.dims().size(); + // For 0D Tensor + if (rank == 0) { + phi::funcs::set_constant(dev_ctx, out, static_cast(1.0)); + return; + } + + SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_PLUGIN_KERNEL(softmax, + metax_gpu, + ALL_LAYOUT, + phi::SoftmaxGPUDNNKernel, + float, + phi::float16, + phi::bfloat16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_PLUGIN_KERNEL(softmax, + metax_gpu, + ALL_LAYOUT, + phi::SoftmaxGPUDNNKernel, + float, + double, + phi::float16, + phi::bfloat16) {} +#else +PD_REGISTER_PLUGIN_KERNEL(softmax, + metax_gpu, + ALL_LAYOUT, + phi::SoftmaxGPUDNNKernel, + float, + double, + phi::float16) {} +#endif +#endif diff --git a/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu index 0344a81dc19..523a2e4d76b 100644 --- a/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - +#if 0 #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" @@ -27,3 +27,5 @@ PD_REGISTER_PLUGIN_KERNEL(softmax, double, phi::dtype::float16, phi::dtype::bfloat16) {} + +#endif diff --git a/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu index 5f9d6cc20e0..c8ece09bbae 100644 --- a/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu @@ -15,7 +15,7 @@ #ifndef PADDLE_WITH_HIP // HIP not support cusolver -#include "kernels/impl/values_vectors_functor.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" @@ -60,7 +60,6 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, int ldu = m; int ldt = n; int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); @@ -142,7 +141,6 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, int ldu = m; int ldt = n; int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); @@ -205,17 +203,17 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, } template <> -void GesvdjBatched>(const phi::GPUContext& dev_ctx, - int batchSize, - int m, - int n, - int k, - phi::dtype::complex* A, - phi::dtype::complex* U, - phi::dtype::complex* V, - 
float* S, - int* info, - int thin_UV) { +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + phi::complex64* A, + phi::complex64* U, + phi::complex64* V, + float* S, + int* info, + int thin_UV) { /* compute singular vectors */ const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; /* compute singular vectors */ @@ -224,7 +222,6 @@ void GesvdjBatched>(const phi::GPUContext& dev_ctx, int ldu = m; int ldt = n; int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); @@ -245,10 +242,10 @@ void GesvdjBatched>(const phi::GPUContext& dev_ctx, gesvdj_params)); auto workspace = phi::memory_utils::Alloc( dev_ctx.GetPlace(), - lwork * sizeof(phi::dtype::complex), + lwork * sizeof(phi::complex64), phi::Stream(reinterpret_cast(dev_ctx.stream()))); - phi::dtype::complex* workspace_ptr = - reinterpret_cast*>(workspace->ptr()); + phi::complex64* workspace_ptr = + reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); @@ -289,17 +286,17 @@ void GesvdjBatched>(const phi::GPUContext& dev_ctx, } template <> -void GesvdjBatched>(const phi::GPUContext& dev_ctx, - int batchSize, - int m, - int n, - int k, - phi::dtype::complex* A, - phi::dtype::complex* U, - phi::dtype::complex* V, - double* S, - int* info, - int thin_UV) { +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + phi::complex128* A, + phi::complex128* U, + phi::complex128* V, + double* S, + int* info, + int thin_UV) { /* compute singular vectors */ const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; /* compute singular vectors */ @@ -308,7 +305,6 @@ void GesvdjBatched>(const phi::GPUContext& dev_ctx, int ldu = m; int ldt = n; int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); @@ -329,10 +325,10 @@ void GesvdjBatched>(const phi::GPUContext& dev_ctx, gesvdj_params)); auto workspace = phi::memory_utils::Alloc( dev_ctx.GetPlace(), - lwork * sizeof(phi::dtype::complex), + lwork * sizeof(phi::complex128), phi::Stream(reinterpret_cast(dev_ctx.stream()))); - phi::dtype::complex* workspace_ptr = - reinterpret_cast*>(workspace->ptr()); + phi::complex128* workspace_ptr = + reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? 
k : n); @@ -432,7 +428,7 @@ PD_REGISTER_PLUGIN_KERNEL(svd, // cuda_only phi::SvdKernel, float, double, - phi::dtype::complex, - phi::dtype::complex) {} + phi::complex64, + phi::complex128) {} #endif // not PADDLE_WITH_HIP From e96db665ab94ae14179a4036f0f94d178023741b Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 30 Oct 2025 15:05:39 +0800 Subject: [PATCH 087/121] udata patch (#139) * updata_patch --------- --- backends/metax_gpu/patch/paddle.patch | 131 ++++++++++++++++++++++---- 1 file changed, 113 insertions(+), 18 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 6578029129e..fe0d9e104a5 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -18,6 +18,22 @@ index cfada544d4..a690e97d74 100644 endif() set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) +diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt +index 99a0116d92..2566e7c41a 100755 +--- a/paddle/fluid/operators/fused/CMakeLists.txt ++++ b/paddle/fluid/operators/fused/CMakeLists.txt +@@ -43,6 +43,11 @@ if(WITH_GPU OR WITH_ROCM) + op_library(fused_multi_transformer_int8_op) + endif() + ++ if 1 ++ op_library(fused_gemm_epilogue_op) ++ endif() ++ ++ + if(CUDA_VERSION GREATER_EQUAL 11.6) + op_library(fused_gemm_epilogue_op) + endif() diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -441,10 +457,38 @@ index 024a7de73e..66b373d698 100644 } while (0) #elif defined(__HIPCC__) diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h -index ae7b67de6d..fbe9f67737 100644 +index ae7b67de6d..9ac725314f 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h -@@ -368,7 +368,7 @@ struct CUBlas { +@@ -218,11 +218,27 @@ struct CUBlas { + } + }; + ++template ++void print_args(Args... args) { ++ std::cout << "Arguments (" << sizeof...(args) << "): ["; ++ bool first = true; ++ auto printer = [&first](const auto& arg) { ++ if (!first) std::cout << ", "; ++ std::cout << arg; ++ first = false; ++ }; ++ (printer(args), ...); ++ std::cout << "]" << std::endl; ++} ++ + template <> + struct CUBlas { + template + static void GEMM(ARGS... 
args) { ++ // print_args(args...); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemm(args...)); ++ ++ + } + + template +@@ -368,7 +384,7 @@ struct CUBlas { cudaDataType_t Ctype, int ldc, int batchCount, @@ -453,7 +497,7 @@ index ae7b67de6d..fbe9f67737 100644 #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 -@@ -476,7 +476,7 @@ struct CUBlas { +@@ -476,7 +492,7 @@ struct CUBlas { void *C, cudaDataType_t Ctype, int ldc, @@ -462,7 +506,7 @@ index ae7b67de6d..fbe9f67737 100644 #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 -@@ -532,7 +532,7 @@ struct CUBlas { +@@ -532,7 +548,7 @@ struct CUBlas { void *C, cudaDataType_t Ctype, int64_t ldc, @@ -471,7 +515,7 @@ index ae7b67de6d..fbe9f67737 100644 #if CUDA_VERSION >= 12030 && defined(__linux__) cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = dev_ctx->tensor_core_available(); -@@ -759,7 +759,7 @@ struct CUBlas { +@@ -759,7 +775,7 @@ struct CUBlas { void *C, cudaDataType_t Ctype, int ldc, @@ -480,7 +524,7 @@ index ae7b67de6d..fbe9f67737 100644 #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 -@@ -815,7 +815,7 @@ struct CUBlas { +@@ -815,7 +831,7 @@ struct CUBlas { void *C, cudaDataType_t Ctype, int64_t ldc, @@ -489,7 +533,7 @@ index ae7b67de6d..fbe9f67737 100644 #if CUDA_VERSION >= 12030 && defined(__linux__) cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = dev_ctx->tensor_core_available(); -@@ -1154,7 +1154,7 @@ struct CUBlas { +@@ -1154,7 +1170,7 @@ struct CUBlas { void *C, cudaDataType_t Ctype, int ldc, @@ -498,7 +542,7 @@ index ae7b67de6d..fbe9f67737 100644 #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 -@@ -1210,7 +1210,7 @@ struct CUBlas { +@@ -1210,7 +1226,7 @@ struct CUBlas { void *C, cudaDataType_t Ctype, int64_t ldc, @@ -507,7 +551,7 @@ index ae7b67de6d..fbe9f67737 100644 #if CUDA_VERSION >= 12030 && defined(__linux__) cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = dev_ctx->tensor_core_available(); -@@ -1484,7 +1484,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1484,7 +1500,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, C, CUDA_R_16F, N, @@ -516,7 +560,7 @@ index ae7b67de6d..fbe9f67737 100644 #else PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); -@@ -1508,7 +1508,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1508,7 +1524,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, C, CUDA_R_16F, static_cast(N), @@ -525,7 +569,7 @@ index ae7b67de6d..fbe9f67737 100644 } #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm -@@ -1694,7 +1694,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1694,7 +1710,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, C, CUDA_R_16F, N, @@ -534,7 +578,7 @@ index ae7b67de6d..fbe9f67737 100644 #else PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); -@@ -1719,7 +1719,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1719,7 +1735,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, C, CUDA_R_16F, static_cast(N), @@ -543,7 +587,7 @@ index ae7b67de6d..fbe9f67737 100644 #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm dev_ctx_.CublasCall([&](cublasHandle_t handle) { -@@ -1831,7 +1831,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1831,7 +1847,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE 
transA, C, CUDA_R_16BF, static_cast(N), @@ -552,7 +596,7 @@ index ae7b67de6d..fbe9f67737 100644 algo)); }); } -@@ -1932,7 +1932,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1932,7 +1948,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, C, CUDA_R_16BF, static_cast(N), @@ -561,7 +605,7 @@ index ae7b67de6d..fbe9f67737 100644 algo)); }); } -@@ -2026,7 +2026,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -2026,7 +2042,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, C, CUDA_C_32F, static_cast(N), @@ -570,7 +614,7 @@ index ae7b67de6d..fbe9f67737 100644 #else dev_ctx_.CublasCall([&](cublasHandle_t handle) { -@@ -2111,7 +2111,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -2111,7 +2127,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, C, CUDA_C_64F, N, @@ -579,7 +623,7 @@ index ae7b67de6d..fbe9f67737 100644 #else PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); -@@ -2136,7 +2136,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -2136,7 +2152,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, C, CUDA_C_64F, static_cast(N), @@ -588,7 +632,25 @@ index ae7b67de6d..fbe9f67737 100644 #else // CUDA_VERSION >= 8000 // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm dev_ctx_.CublasCall([&](cublasHandle_t handle) { -@@ -3129,7 +3129,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -2272,7 +2288,7 @@ inline void Blas::GEMM(bool transA, + C, + CUDA_R_16F, + ldc, +- CUDA_R_32F, ++ CUBLAS_COMPUTE_32F, + algo)); + }); + } +@@ -2334,7 +2350,7 @@ inline void Blas::GEMM(bool transA, + C, + CUDA_R_16BF, + ldc, +- CUDA_R_32F, ++ CUBLAS_COMPUTE_32F, + algo)); + }); + #else +@@ -3129,7 +3145,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, CUDA_R_16F, ldc, batchCount, @@ -597,6 +659,15 @@ index ae7b67de6d..fbe9f67737 100644 } template <> +@@ -3197,7 +3213,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CUDA_R_16BF, + ldc, + batchCount, +- CUDA_R_32F, ++ CUBLAS_COMPUTE_32F, + algo)); + }); + #else diff --git a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h index e63b3d2f6e..95d7e6f204 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h @@ -1129,3 +1200,27 @@ index e6b3960f6d..564125f1f6 100644 if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); +diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +index 410fb3c560..7d173d46f5 100644 +--- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h ++++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +@@ -20,8 +20,8 @@ + namespace phi { + template + HOSTDEVICE T digamma_positive_domain(T x) { +- static T c = T{8.5}; +- static T euler_mascheroni = T{0.57721566490153286060}; ++ const static T c = T{8.5}; ++ const static T euler_mascheroni = T{0.57721566490153286060}; + T r; + T value; + T x2; +@@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { + + template + HOSTDEVICE T digamma(T x) { +- static T pi = T{3.14159265358979323846}; ++ const static T pi = T{3.14159265358979323846}; + + if (x == T{0.0}) { + T inf = std::numeric_limits::infinity(); From a6d4b7db716bad842731b5abdab91e110af33e35 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 30 Oct 2025 16:01:05 +0800 Subject: [PATCH 088/121] [Metax] optimize wint4 quantization implementation (#140) * [Metax] optimize wint4 quantization implementation --- 
.../impl/metax_weight_quantize_kernel_impl.h | 37 +++++++++++++++---- .../weight_quantize_kernel_register.cu | 10 ++++- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h index b305ec96a30..9aedba871c5 100644 --- a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h @@ -25,7 +25,7 @@ #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { - +template void show_2d_cpu_tensor(const DenseTensor& tensor, const int64_t row_num = 3, const int64_t col_num = 3) { @@ -33,18 +33,18 @@ void show_2d_cpu_tensor(const DenseTensor& tensor, const int64_t cols = tensor.dims()[1]; printf("\nTensor shape = [%d, %d]\n", rows, cols); - const int8_t* cpu_ptr = tensor.data(); + const DataType* cpu_ptr = tensor.data(); for (int r = 0; r < row_num; r++) { for (int c = 0; c < col_num; c++) { - int8_t val = *(cpu_ptr + r * cols + c); - printf("%d ", val); + DataType val = *(cpu_ptr + r * cols + c); + printf("%#x ", val); } printf("\n"); } printf("\n\n"); } - +template void show_2d_gpu_tensor(const CustomContext& dev_ctx, const DenseTensor& tensor, const int64_t row_num = 3, @@ -58,18 +58,39 @@ void show_2d_gpu_tensor(const CustomContext& dev_ctx, const int64_t cols = cpu_tensor.dims()[1]; printf("\nTensor shape = [%d, %d]\n", rows, cols); - const int8_t* cpu_ptr = cpu_tensor.data(); + const DataType* cpu_ptr = cpu_tensor.data(); for (int r = 0; r < row_num; r++) { for (int c = 0; c < col_num; c++) { - int8_t val = *(cpu_ptr + r * cols + c); - printf("%d ", val); + DataType val = *(cpu_ptr + r * cols + c); + printf("%#x ", val); } printf("\n"); } printf("\n\n"); } +template +void show_1d_gpu_tensor(const CustomContext& dev_ctx, + const DenseTensor& tensor, + const int64_t num = 3) { + phi::CPUPlace cpu_place; + + DenseTensor cpu_tensor; + phi::Copy(dev_ctx, tensor, cpu_place, true, &cpu_tensor); + + const int64_t nums = cpu_tensor.numel(); + printf("\nTensor shape = [%d]\n", nums); + + const DataType* cpu_ptr = cpu_tensor.data(); + + for (int n = 0; n < num; n++) { + DataType val = *(cpu_ptr + n); + printf("%#x ", val); + } + printf("\n\n"); +} + void cpu_2d_tensor_transpose(const DenseTensor& input_data, DenseTensor* transposed_data) { const int64_t input_data_rows = input_data.dims()[0]; diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 8d72ed2138e..efc18693e21 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -116,7 +116,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, dev_ctx.template Alloc(scale); weight_quant_gpu(dev_ctx, x.data(), - out->data(), + quanted_x.data(), scale->data(), weight_shape, arch, @@ -141,7 +141,13 @@ void WeightQuantizeKernel(const Context& dev_ctx, // arch, // algo); #endif - MetaxQuantizedWeightLayoutTrans(dev_ctx, algo, weight_shape, out); + quanted_x.Resize({m / 2, n}); + + std::vector axis = {1, 0}; + funcs::Transpose trans; + trans(dev_ctx, quanted_x, out, axis); + + out->Resize({n / 2, m}); } else if (algo == "w4a8") { weight_permute_gpu_w4a8(dev_ctx, x.data(), From d575c15d592f566f1e1bef984cbb56b51c13887c Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> 
Date: Thu, 30 Oct 2025 17:34:02 +0800 Subject: [PATCH 089/121] change_flag (#141) * change_flag --- backends/metax_gpu/common/flags_declare.cc | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/backends/metax_gpu/common/flags_declare.cc b/backends/metax_gpu/common/flags_declare.cc index 0b65d635510..fb656878033 100644 --- a/backends/metax_gpu/common/flags_declare.cc +++ b/backends/metax_gpu/common/flags_declare.cc @@ -101,18 +101,6 @@ PHI_DEFINE_EXPORTED_bool( "faster but it may loss precision in most case. If true, the compute " "type will be set to fp16. Default is false."); -/** - * Torch Compatible related FLAG - * Name: FLAGS_torch_compatible_kernel - * Since Version: 3.2.2 - * Value Range: bool, default=false - * Example: - * Note: Whether use torch compatible version kernel. - */ -PHI_DEFINE_EXPORTED_bool(torch_compatible_kernel, - false, - "Whether use torch compatible version kernel."); - PHI_DEFINE_EXPORTED_string( selected_gpus, "", From 6a5c6c984111355e01ef89d04966f4f7b9e11677 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Fri, 31 Oct 2025 16:58:48 +0800 Subject: [PATCH 090/121] [Metax] register fused_fc_elementwise_layernorm kernel (#143) * [Metax] register fused_fc_elementwise_layernorm kernel --- ...c_elementwise_layernorm_kernel_register.cu | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/fused_fc_elementwise_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_fc_elementwise_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_fc_elementwise_layernorm_kernel_register.cu new file mode 100644 index 00000000000..f52b0cc4b78 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_fc_elementwise_layernorm_kernel_register.cu @@ -0,0 +1,23 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_fc_elementwise_layernorm, + metax_gpu, + ALL_LAYOUT, + phi::fusion::FusedFCElementwiseLayerNormKernel, + float, + double, + phi::float16) {} From d09dc9b979db97ab6aa1e7d0e7a2c58ad37e0c5d Mon Sep 17 00:00:00 2001 From: Metax_paddle <1640472053@qq.com> Date: Fri, 31 Oct 2025 17:30:01 +0800 Subject: [PATCH 091/121] updata paddle --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 1f00e2178ad..2b9ba85d9c5 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 1f00e2178ad3249ecd8bb83e59bc6ac1ebcac413 +Subproject commit 2b9ba85d9c512c05e20b38ea822dc808e410609f From b02687e09c6252fe2cd6206077d536340b20e8d9 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:28:25 +0800 Subject: [PATCH 092/121] [Metax] add private CI (#144) * [Metax] add private CI --- .github/workflows/metax_work_private.yaml | 96 ++++++++++++++++ backends/metax_gpu/build_private_CI.sh | 91 +++++++++++++++ backends/metax_gpu/patch/paddle.patch | 106 ++---------------- .../metax_gpu/runtime/process_cupti_data.cc | 83 -------------- backends/metax_gpu/tests/run_test.sh | 5 +- 5 files changed, 200 insertions(+), 181 deletions(-) create mode 100644 .github/workflows/metax_work_private.yaml create mode 100644 backends/metax_gpu/build_private_CI.sh diff --git a/.github/workflows/metax_work_private.yaml b/.github/workflows/metax_work_private.yaml new file mode 100644 index 00000000000..afe6fd5c30d --- /dev/null +++ b/.github/workflows/metax_work_private.yaml @@ -0,0 +1,96 @@ +name: paddle metax gpu private test + +on: + workflow_dispatch: + pull_request: + types: [opened, synchronize] + branches: [develop, release/**] +permissions: read-all + +defaults: + run: + shell: bash + +jobs: + metax-gpu-test: + runs-on: paddle-metax-runner-set + # runs-on: debug-paddle-runner-set + steps: + - name: Checkout repository + run: | + git config --global user.name "GitHub Actions" + git config --global user.email "actions@github.com" + + git clone \ + --reference-if-able /home/runner/PaddleCustomDevice \ + --depth=1 \ + --shallow-submodules \ + --jobs=8 \ + --branch ${{ github.base_ref || github.ref_name}} \ + --recurse-submodules \ + https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
+ + if [ "${{ github.event_name }}" == "pull_request" ]; then + git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head + git checkout pull/${{ github.event.pull_request.number }}/head + + + + + paddle_branch=${{ github.base_ref || github.ref_name}} + echo $paddle_branch + # sleep 10000 + change_numbers=$(git diff --name-only remotes/origin/${paddle_branch} | wc -l) + echo $change_numbers + + + change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/" || true) + echo $change_backend + change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/metax_gpu" || true) + echo $change_metax_only + + # change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/"| wc -l) + # echo $change_backend + # change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/metax_gpu"| wc -l) + # echo $change_metax_only + + git diff --name-only remotes/origin/${paddle_branch} + + if [ $change_numbers -ne $change_backend ]; then + echo "Common file changed, continue to run metax FULL CI test ..." + elif [ $paddle_branch -eq 0 ] ; then + echo "NO metax backend changes found, skip metax FULL CI ....." + exit 0 + fi + + + # git submodule update --init --recursive + fi + + + - name: compile + run: | + # sleep 10000 + cd backends/metax_gpu + bash build_private_CI.sh + + - name: run test + + run: | + cd backends/metax_gpu/tests + bash run_test.sh -j 16 + + - name: push whl + env: + PR_ID: ${{ github.event.pull_request.number }} + COMMIT_ID: ${{ github.event.pull_request.head.sha }} + run: | + pip install bce-python-sdk==0.8.74 + export AK=paddle + export SK=paddle + if [ ! -f "BosClient.py}" ]; then + wget -q --no-proxy https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate + tar xf bos_retry.tar.gz + fi + cp backends/metax_gpu/build/dist/paddle_metax_gpu*.whl . + python BosClient.py paddle_metax_gpu*.whl paddle-github-action/PaddleCustomDevice/metax_gpu/${PR_ID}/${COMMIT_ID} diff --git a/backends/metax_gpu/build_private_CI.sh b/backends/metax_gpu/build_private_CI.sh new file mode 100644 index 00000000000..edbb326e081 --- /dev/null +++ b/backends/metax_gpu/build_private_CI.sh @@ -0,0 +1,91 @@ +# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +#!/bin/bash + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +# uninstall paddle +pip uninstall paddlepaddle -y + + +#!/bin/bash + +# update_paddle_dev.sh + +chown -R $USER:$USER ../../Paddle/ +chown -R $USER:$USER ../../../PaddleCustomDevice/ +# Step 1: 撤销所有本地修改(已跟踪的文件,不包括新文件) +cd ../../Paddle/ +echo "🔄 正在撤销所有本地修改(git checkout .)..." +git checkout develop +git checkout . + +# Step 2: 拉取远程最新的 dev (通常是 develop) 分支代码 +echo "🌐 正在拉取远程最新的 dev (develop) 分支代码..." 
+ + +# 拉取 develop 分支的最新代码(与远程同步) +git pull origin develop + +echo "🔗 当前分支: $(git branch --show-current)" +echo "📌 最新 commit hash (短): $(git rev-parse --short HEAD)" +echo "📌 最新 commit 信息:" +git log -1 --oneline + +# 提示完成 +echo "✅ 脚本执行完毕!" +echo "📌 已撤销本地修改,并更新到 Paddle 最新的 develop (dev) 分支代码。" + + +pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package +# install paddle + +python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ + + +# unset http_proxy https_proxy +cd - +# apply patch +bash change_patch.sh + +export MACA_PATH=/opt/maca +export CUDA_PATH=/workspace/cuda-11.7/ +export PATH=${CUDA_PATH}/bin:${PATH} +export CUCC_PATH=${MACA_PATH}/tools/cu-bridge +export PATH=${PATH}:${CUCC_PATH}/tools:${CUCC_PATH}/bin +export PATH=${MACA_PATH}/bin:${PATH} +export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} +export PADDLE_VERSION=3.3.0 + +if [ ! -d build ]; then + echo "build directory not found, creating..." + mkdir build +fi + +echo "make_maca" +cd build +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +make_maca -j60 + +echo "install whl" +pip install dist/paddle_metax_gpu*.whl --force-reinstall +cd .. +echo "Done!" + +cd build/dist/ +ossutil ls oss://opensource-ci/paddle/ +ossutil cat oss://opensource-ci/paddle/test1 +ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/test1 +cd - diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index fe0d9e104a5..8cd18045094 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -48,7 +48,7 @@ index bff0f2bf70..9376b5781f 100644 #include "paddle/phi/core/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/platform/profiler/utils.h" diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h -index 62beb53cfe..0b0ac09fc0 100644 +index bda9cbe17e..c73eba9c8a 100644 --- a/paddle/phi/backends/dynload/cublas.h +++ b/paddle/phi/backends/dynload/cublas.h @@ -49,7 +49,12 @@ extern void *cublas_dso_handle; @@ -98,107 +98,21 @@ index 8b2e08c777..ca926df151 100644 #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasLtCreate); \ diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h -index c0080f0a5e..458ca3e2e8 100644 +index a943bbed9a..af931490e3 100644 --- a/paddle/phi/backends/dynload/cudnn.h +++ b/paddle/phi/backends/dynload/cudnn.h -@@ -38,7 +38,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name); +@@ -38,7 +38,10 @@ extern void EnforceCUDNNLoaded(const char* fn_name); cudnn_dso_handle = phi::dynload::GetCUDNNDsoHandle(); \ }); \ EnforceCUDNNLoaded(#__name); \ - static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + std::string replaced_name = #__name; \ -+ replaced_name = replaced_name.replace(0,2,"mc"); \ -+ static void* p_##__name = dlsym(cudnn_dso_handle, replaced_name.c_str()); \ ++ replaced_name = replaced_name.replace(0, 2, "mc"); \ ++ static void* p_##__name = \ ++ dlsym(cudnn_dso_handle, replaced_name.c_str()); \ return reinterpret_cast(p_##__name)(args...); \ } \ }; \ -@@ -49,7 +51,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name); - * different cudnn version has different interfaces - **/ - #define CUDNN_DNN_ROUTINE_EACH(__macro) \ -- __macro(cudnnSetCallback); \ - __macro(cudnnSetTensor4dDescriptor); \ - __macro(cudnnSetTensor4dDescriptorEx); \ - __macro(cudnnSetTensorNdDescriptor); \ -@@ -104,6 
+105,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name); - __macro(cudnnSetDropoutDescriptor); \ - __macro(cudnnRestoreDropoutDescriptor); \ - __macro(cudnnCreateRNNDescriptor); \ -+ __macro(cudnnGetRNNParamsSize); \ -+ __macro(cudnnGetRNNWorkspaceSize); \ -+ __macro(cudnnGetRNNTrainingReserveSize); \ -+ __macro(cudnnRNNForwardTraining); \ -+ __macro(cudnnRNNBackwardData); \ -+ __macro(cudnnRNNBackwardWeights); \ -+ __macro(cudnnRNNForwardInference); \ - __macro(cudnnDestroyDropoutDescriptor); \ - __macro(cudnnDestroyRNNDescriptor); \ - __macro(cudnnSetTensorNdDescriptorEx); \ -@@ -118,7 +126,8 @@ extern void EnforceCUDNNLoaded(const char* fn_name); - __macro(cudnnCreateActivationDescriptor); \ - __macro(cudnnSetActivationDescriptor); \ - __macro(cudnnGetActivationDescriptor); \ -- __macro(cudnnDestroyActivationDescriptor); -+ __macro(cudnnDestroyActivationDescriptor); \ -+ __macro(cudnnSetRNNDescriptor_v6); - CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - - #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 -@@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ - __macro(cudnnCreateRNNDataDescriptor); \ - __macro(cudnnDestroyRNNDataDescriptor); \ -- __macro(cudnnSetRNNDataDescriptor); -+ __macro(cudnnSetRNNDataDescriptor); \ -+ __macro(cudnnSetRNNPaddingMode); \ -+ __macro(cudnnRNNForwardTrainingEx); \ -+ __macro(cudnnRNNBackwardDataEx); \ -+ __macro(cudnnRNNBackwardWeightsEx); \ -+ __macro(cudnnRNNForwardInferenceEx); - CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - #endif - -@@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - #endif - --#if CUDNN_VERSION < 90000 --#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ -- __macro(cudnnGetRNNParamsSize); \ -- __macro(cudnnGetRNNWorkspaceSize); \ -- __macro(cudnnGetRNNTrainingReserveSize); \ -- __macro(cudnnSetRNNDescriptor_v6); \ -- __macro(cudnnRNNForwardInference); \ -- __macro(cudnnRNNForwardTraining); \ -- __macro(cudnnRNNBackwardData); \ -- __macro(cudnnRNNBackwardWeights); --CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) --#endif -- --#if CUDNN_VERSION < 90000 && CUDNN_VERSION >= 7201 --#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9(__macro) \ -- __macro(cudnnSetRNNPaddingMode); \ -- __macro(cudnnRNNForwardInferenceEx); \ -- __macro(cudnnRNNForwardTrainingEx); \ -- __macro(cudnnRNNBackwardDataEx); \ -- __macro(cudnnRNNBackwardWeightsEx); --CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9( -- DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) --#endif -- --#if CUDNN_VERSION >= 90000 --#define CUDNN_DNN_ROUTINE_EACH_R9(__macro) \ -- __macro(cudnnGetLastErrorString); \ -- __macro(cudnnGetRNNWeightSpaceSize); \ -- __macro(cudnnGetRNNTempSpaceSizes); \ -- __macro(cudnnRNNForward); \ -- __macro(cudnnRNNBackwardData_v8); \ -- __macro(cudnnRNNBackwardWeights_v8); --CUDNN_DNN_ROUTINE_EACH_R9(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) --#endif - } // namespace dynload - } // namespace phi - diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h index 1547909d92..ef20838434 100644 --- a/paddle/phi/backends/dynload/cufft.h @@ -247,7 +161,7 @@ index 59e92955c9..d2f8c2da15 100644 +#endif // PADDLE_WITH_CUPTI \ No newline at end of file diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h -index 86651fc8f1..7c9b122a17 100644 +index 
57e09bb6e4..87fb5b1797 100644 --- a/paddle/phi/backends/dynload/cusolver.h +++ b/paddle/phi/backends/dynload/cusolver.h @@ -34,7 +34,9 @@ extern void *cusolver_dso_handle; @@ -262,7 +176,7 @@ index 86651fc8f1..7c9b122a17 100644 } \ }; \ diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h -index 8ec3cf2792..6f5460df00 100644 +index e8cb0ac643..e8e7596d44 100644 --- a/paddle/phi/backends/dynload/cusparse.h +++ b/paddle/phi/backends/dynload/cusparse.h @@ -34,7 +34,9 @@ extern void *cusparse_dso_handle; @@ -277,7 +191,7 @@ index 8ec3cf2792..6f5460df00 100644 } \ }; \ diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc -index 859f696896..87b5100a1b 100644 +index c74ae9592e..f6dc68917c 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -18,7 +18,6 @@ limitations under the License. */ @@ -755,7 +669,7 @@ index 4eae698648..5c047723ea 100644 return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; } diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h -index e5361b836e..5ad238df08 100644 +index dff1033db4..0098123818 100644 --- a/paddle/phi/kernels/funcs/math_cuda_utils.h +++ b/paddle/phi/kernels/funcs/math_cuda_utils.h @@ -175,12 +175,12 @@ struct KeyValuePair { diff --git a/backends/metax_gpu/runtime/process_cupti_data.cc b/backends/metax_gpu/runtime/process_cupti_data.cc index 94caca5d8cb..73b39225ef2 100755 --- a/backends/metax_gpu/runtime/process_cupti_data.cc +++ b/backends/metax_gpu/runtime/process_cupti_data.cc @@ -477,57 +477,6 @@ std::vector Tracer::ConsumeBuffers() { void Tracer::ReleaseBuffer(uint8_t* buffer) { AlignedFree(buffer); } -// struct ActivityBuffer { -// ActivityBuffer(uint8_t* addr, size_t size) : addr(addr), valid_size(size) -// {} uint8_t* addr; size_t valid_size; -// }; - -// class Tracer { -// public: -// static Tracer& Instance() { -// static Tracer instance; -// return instance; -// } - -// void AllocateBuffer(uint8_t** buffer, size_t* size) { -// constexpr size_t kBufSize = 1 << 23; // 8 MB -// constexpr size_t kBufAlign = 8; // 8 B -// *buffer = reinterpret_cast(AlignedMalloc(kBufSize, kBufAlign)); -// *size = kBufSize; -// } -// void ProduceBuffer(uint8_t* buffer, size_t valid_size) { -// std::lock_guard guard(activity_buffer_lock_); -// activity_buffers_.emplace_back(buffer, valid_size); -// } -// std::vector ConsumeBuffers(); -// void ReleaseBuffer(uint8_t* buffer); - -// private: -// Tracer() {} - -// std::mutex activity_buffer_lock_; -// std::vector activity_buffers_; -// }; - -// class Tracer { -// public: -// static Tracer& Instance() { -// static Tracer instance; -// return instance; -// } - -// void AllocateBuffer(uint8_t** buffer, size_t* size); -// void ProduceBuffer(uint8_t* buffer, size_t valid_size); -// std::vector ConsumeBuffers(); -// void ReleaseBuffer(uint8_t* buffer); - -// private: -// Tracer() {} - -// std::mutex activity_buffer_lock_; -// std::vector activity_buffers_; -// }; - const char* MemoryKind(uint16_t kind) { switch (kind) { case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN: @@ -579,35 +528,3 @@ std::unordered_map CreateThreadIdMapping() { return mapping; } } // namespace details - -// void Tracer::ReleaseBuffer(void* buffer) { AlignedFree(buffer); } - -// int ProcessCuptiActivity(C_Profiler prof, uint64_t tracing_start_ns_) { -// int record_cnt = 0; -// CUPTI_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)); -// auto mapping 
= details::CreateThreadIdMapping(); -// std::vector buffers = Tracer::Instance().ConsumeBuffers(); -// for (auto& buffer : buffers) { -// if (buffer.addr == nullptr || buffer.valid_size == 0) { -// continue; -// } -// CUpti_Activity* record = nullptr; -// while (true) { -// CUptiResult status = -// cuptiActivityGetNextRecord(buffer.addr, buffer.valid_size, -// &record); -// if (status == CUPTI_SUCCESS) { -// ProcessCuptiActivityRecord(record, tracing_start_ns_, mapping, prof); -// ++record_cnt; -// } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { -// break; -// } else { -// CUPTI_CALL(status); -// } -// } - -// Tracer::Instance().ReleaseBuffer(buffer.addr); -// // ReleaseBuffer(buffer.addr); -// } -// return record_cnt; -// } diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index 7f2277fe4fb..31b175a60bc 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -21,8 +21,9 @@ LEGACY_TEST_PATH="${SCRIPT_DIR}/../../../Paddle/test/legacy_test" TEST_PATH1="${SCRIPT_DIR}/../../../python" TEST_PATH2="${SCRIPT_DIR}/../../../python/tests" export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}" - -export +export PADDLE_XCCL_BACKEND=metax_gpu +export CUDA_VISIBLE_DEVICES=0 +# export # sleep 1000000 From 1dc2b978edfcb324f1637b4e1a41bf21a74ed092 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 6 Nov 2025 23:42:39 +0800 Subject: [PATCH 093/121] [Metax] add Upload (#145) * [Metax] add Upload --- .github/workflows/metax_work.yaml | 2 +- .github/workflows/metax_work_private.yaml | 2 +- backends/metax_gpu/build_private_CI.sh | 2 +- backends/metax_gpu/tests/default.txt | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index a999a9ddb5d..486236955ad 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -78,7 +78,7 @@ jobs: run: | cd backends/metax_gpu/tests - bash run_test.sh -j 16 + bash run_test.sh -j 8 - name: push whl env: diff --git a/.github/workflows/metax_work_private.yaml b/.github/workflows/metax_work_private.yaml index afe6fd5c30d..0ead1afee46 100644 --- a/.github/workflows/metax_work_private.yaml +++ b/.github/workflows/metax_work_private.yaml @@ -78,7 +78,7 @@ jobs: run: | cd backends/metax_gpu/tests - bash run_test.sh -j 16 + bash run_test.sh -j 8 - name: push whl env: diff --git a/backends/metax_gpu/build_private_CI.sh b/backends/metax_gpu/build_private_CI.sh index edbb326e081..e464bf768fe 100644 --- a/backends/metax_gpu/build_private_CI.sh +++ b/backends/metax_gpu/build_private_CI.sh @@ -87,5 +87,5 @@ echo "Done!" 
cd build/dist/ ossutil ls oss://opensource-ci/paddle/ ossutil cat oss://opensource-ci/paddle/test1 -ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/test1 +ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/test1/ cd - diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 54f0b7c008f..ccedd44ced0 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -164,7 +164,6 @@ test_empty_op test_functional_conv1d_transpose test_clip_by_norm_op test_box_clip_op -test_clip_op test_grad_clip_minimize test_less_than_op test_adamw_op From 93ae313648e2040e81b112923d9cfabfb310f6f9 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 7 Nov 2025 19:04:47 +0800 Subject: [PATCH 094/121] test (#154) --- .github/workflows/metax_work_private.yaml | 2 ++ Paddle | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/metax_work_private.yaml b/.github/workflows/metax_work_private.yaml index 0ead1afee46..b4341fa4506 100644 --- a/.github/workflows/metax_work_private.yaml +++ b/.github/workflows/metax_work_private.yaml @@ -5,6 +5,8 @@ on: pull_request: types: [opened, synchronize] branches: [develop, release/**] + schedule: + - cron: "0 15 * * *" permissions: read-all defaults: diff --git a/Paddle b/Paddle index 2b9ba85d9c5..b009972297d 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 2b9ba85d9c512c05e20b38ea822dc808e410609f +Subproject commit b009972297d9423ccbdb5ddb6d75cb8db9080e25 From d87a65404cc5a9ef215c20a0ea562399344b567a Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Fri, 7 Nov 2025 19:22:19 +0800 Subject: [PATCH 095/121] ReRun CI (#150) --- backends/metax_gpu/build_private_CI.sh | 2 +- .../cuda_kernels/uniform_kernel_register.cu | 7 +- backends/metax_gpu/tests/default.txt | 57 ++ .../metax_gpu/tests/scripts/log_analysis.py | 4 +- .../tests/unit_test/test_einsum_metax.py | 566 ++++++++++++++++++ 5 files changed, 631 insertions(+), 5 deletions(-) create mode 100644 backends/metax_gpu/tests/unit_test/test_einsum_metax.py diff --git a/backends/metax_gpu/build_private_CI.sh b/backends/metax_gpu/build_private_CI.sh index e464bf768fe..113bb14a681 100644 --- a/backends/metax_gpu/build_private_CI.sh +++ b/backends/metax_gpu/build_private_CI.sh @@ -49,7 +49,7 @@ echo "✅ 脚本执行完毕!" 
echo "📌 已撤销本地修改,并更新到 Paddle 最新的 develop (dev) 分支代码。" -pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package +pip install parameterized safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ diff --git a/backends/metax_gpu/kernels/cuda_kernels/uniform_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/uniform_kernel_register.cu index 55fa64fa63e..1541c855404 100755 --- a/backends/metax_gpu/kernels/cuda_kernels/uniform_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/uniform_kernel_register.cu @@ -21,5 +21,8 @@ PD_CUSTOM_KERNEL_REGISTER(uniform, phi::UniformKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16, + phi::float8_e4m3fn, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index ccedd44ced0..16c5531c47b 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -310,3 +310,60 @@ test_dygraph_spectral_norm test_block_diag test_index_elementwise test_matmul_out +test_arg_min_max_op +test_as_strided +test_cartesian_prod +test_coalesce_tensor_op +test_conv2d_op +test_conv2d_op_depthwise_conv +test_cross_entropy_loss +test_cross_op +test_cumprod_op_dtype +test_einsum_v2 +test_elementwise_max_op +test_elementwise_min_op +test_elementwise_tensor_split +test_embedding_deterministic +test_empty +test_eye +test_eye_op +test_full +test_fused_multihead_matmul_op +test_fused_rotary_position_embedding +test_gaussian_random_op +test_grid_sample_function +test_hapi_amp +test_histogram_op +test_imperative_ptb_rnn +test_incubate_cal_aux_loss +test_incubate_cross_entropy_with_softmax_bwd_w_downcast +test_incubate_embedding_grad +test_incubate_int_bincount +test_incubate_moe_combine_no_weight +test_index_fill +test_index_put_op +test_index_select_strided +test_instance_norm_op +test_kldiv_loss_op +test_label_smooth_op +test_linear +test_margin_cross_entropy_op +test_matmul_0_size_op +test_matmul_int8_op +test_meshgrid_op +test_min_op +test_momentum_op +test_multinomial_op +test_nn_functional_embedding_dygraph +test_nonzero_api +test_pow +test_pow_op +test_prod_op +test_randperm_op +test_rnn_decode_api +test_rnn_op +test_round_op +test_spectral_norm_op +test_take_along_axis_op +test_tensor_unfold +test_warpctc_op diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py index 963d50751f7..3300c7bc082 100644 --- a/backends/metax_gpu/tests/scripts/log_analysis.py +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -206,10 +206,10 @@ def run(self): analyzer = LogAnalyzer( classify_file="./classify.json", - search_path="./NPU_logs/20250918_065326", + search_path="./High_op/logs_output-20251106", pattern="test_*.log", ) analyzer.run() analyzer.show_result() - analyzer.save_result("./output") + analyzer.save_result("./High_op/logs_output-20251106-result") diff --git a/backends/metax_gpu/tests/unit_test/test_einsum_metax.py b/backends/metax_gpu/tests/unit_test/test_einsum_metax.py new file mode 100644 index 00000000000..c25c85cca84 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_einsum_metax.py @@ -0,0 +1,566 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import numpy as np +from op_test import get_device_place, is_custom_device + +import paddle +from paddle.base import core + +core.set_cublas_switch(False) + +os.environ["FLAGS_new_einsum"] = "0" + + +class TestErrors(unittest.TestCase): + def setUp(self): + pass + + def test_diagonalize_errors(self): + a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype("float") + a = paddle.to_tensor(a) + with self.assertRaisesRegex( + AssertionError, ("Duplicate labels are not supported.") + ): + paddle.einsum("...ii->...i", a) + with self.assertRaisesRegex( + AssertionError, ("Duplicate labels are not supported.") + ): + paddle.einsum("i...i", a) + with self.assertRaisesRegex( + AssertionError, ("Duplicate labels are not supported.") + ): + paddle.einsum("i...i->i...", a) + + def test_param_errors(self): + a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype("float") + a = paddle.to_tensor(a) + with self.assertRaisesRegex( + AssertionError, ("At least one operand is expected.") + ): + paddle.einsum("ijk") + with self.assertRaisesRegex( + AssertionError, ("Invalid equation: multiple `->` were found.") + ): + paddle.einsum("i -> j -> k", a) + with self.assertRaisesRegex( + AssertionError, + ( + "Invalid equation: the number of operands is 2, " + "but found 3 segments in the label equation." + ), + ): + paddle.einsum("i,j,k", a, a) + with self.assertRaisesRegex( + AssertionError, + ( + "Invalid equation: the number of operands is 2, " + "but found 1 segments in the label equation." + ), + ): + paddle.einsum("ij -> k", a, a) + with self.assertRaisesRegex( + AssertionError, + ( + "Invalid equation: the number of operands is 1, " + "but found 2 segments in the label equation." + ), + ): + paddle.einsum("i, -> k", a) + with self.assertRaisesRegex( + AssertionError, + ("Invalid equation: the label string '' misses dimensions."), + ): + paddle.einsum("->", a) + with self.assertRaisesRegex( + AssertionError, + ("Invalid equation: the label string 'i' misses dimensions."), + ): + paddle.einsum("i", a) + with self.assertRaisesRegex( + AssertionError, + ("Invalid equation: _ is not a valid label, " "which should be letters."), + ): + paddle.einsum("i_", a) + with self.assertRaisesRegex( + AssertionError, + ("Invalid equation: `.` is found outside of an ellipsis."), + ): + paddle.einsum("i..j", a) + with self.assertRaisesRegex( + AssertionError, + ("Invalid equation: `.` is found outside of an ellipsis."), + ): + paddle.einsum("...k...", a) + with self.assertRaisesRegex( + AssertionError, + ("Invalid equation: missing ellipsis in output labels."), + ): + paddle.einsum("i...->i", a) + with self.assertRaisesRegex( + AssertionError, + ("Invalid equation: duplicate output labels are found."), + ): + paddle.einsum("i...->i...i", a) + with self.assertRaisesRegex( + AssertionError, + ( + "Invalid operands: label i " + "corresponds to non-broadcastable dimensions." 
+ ), + ): + paddle.einsum("ij...,ji...", a, a) + + +class TestEinsum(unittest.TestCase): + @classmethod + def setUpClass(cls): + np.random.seed(12345) + + cls.TEST_SAMPLES = { + "a": np.random.rand(1, 1), + "b": np.random.rand(1), + "x": np.random.rand(5), + "y": np.random.rand(7), + "A": np.random.rand(4, 5), + "B": np.random.rand(2, 5), + "C": np.random.rand(3, 7), + "D": np.random.rand(3, 4, 5), + "E": np.random.rand(3, 5, 2), + "F": np.random.rand(2, 4, 5, 3), + "G": np.random.rand(4, 2, 5), + "H": np.random.rand(3, 2, 4), + "I": np.random.rand(2, 2), + "J": np.random.rand(1, 3, 5), + "K": np.random.rand(1, 2, 3, 4), + "L": np.random.rand(2, 0, 13), + "M": np.random.rand(13), + } + + def _get_place(self, force_to_use_cpu=False): + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda() or is_custom_device(): + return get_device_place() + return core.CPUPlace() + + def check_output_equal(self, actual, expect, rtol=1.0e-5, atol=1.0e-8): + error_msg = ( + "Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}" + ) + np.testing.assert_allclose( + actual, + expect, + rtol=rtol, + atol=atol, + err_msg=error_msg.format( + paddle.get_device(), expect, actual, self.__class__.__name__ + ), + ) + + def setUp(self): + self.sample = {"paradigm": "i->", "data": ["x"]} + + def test_forward(self): + operands = [TestEinsum.TEST_SAMPLES[operand] for operand in self.sample["data"]] + expected_result = np.einsum(self.sample["paradigm"], *operands) + equation = self.sample["paradigm"] + + with paddle.base.dygraph.guard(self._get_place(force_to_use_cpu=False)): + pd_operands = [paddle.to_tensor(operand) for operand in operands] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + + with paddle.base.dygraph.guard(self._get_place(force_to_use_cpu=True)): + pd_operands = [paddle.to_tensor(operand) for operand in operands] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + + +class TestEinsumVectorDot(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,i->", "data": ["x", "x"]} + + +class TestEinsumVectorMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,i->i", "data": ["x", "x"]} + + +class TestEinsumVectorOuter(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,j->ij", "data": ["x", "y"]} + + +class TestEinsumMatrixTranspose(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->ji", "data": ["A"]} + + +class TestEinsumMatrixRowSum(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->j", "data": ["A"]} + + +class TestEinsumMatrixColSum(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->i", "data": ["A"]} + + +class TestEinsumMatrixEleMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,ij->ij", "data": ["A", "A"]} + + +class TestEinsumDegenerateMatrixVecMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,j", "data": ["a", "b"]} + + +class TestEinsumMatrixVecMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,j->i", "data": ["A", "x"]} + + +class TestEinsumMatrixMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,kj->ik", "data": ["A", "B"]} + + +class TestEinsumMatrixOuter(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,kl->ijkl", "data": ["A", "C"]} + + +class TestEinsumTensorBMM(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "bij,bjk->bik", "data": ["D", "E"]} + + +class 
TestEinsumTensorContract1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->i", "data": ["D", "A"]} + + +class TestEinsumTensorContract2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,lk->ijl", "data": ["D", "B"]} + + +class TestEinsumTensorContract3(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "abcd,dfg->abcfg", "data": ["F", "D"]} + + +class TestEinsumTensorContract4(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->ik", "data": ["D", "A"]} + + +class TestEinsumTensorContract5(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->ij", "data": ["D", "A"]} + + +class TestEinsumTensorContract6(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ik, ijk->j", "data": ["A", "G"]} + + +class TestEinsumTensorContract7(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk, ik->jk", "data": ["G", "A"]} + + +class TestEinsumEllipsis1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i...->...", "data": ["G"]} + + +class TestEinsumEllipsis2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,...i->j...", "data": ["A", "H"]} + + +class TestEinsumEllipsis3(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "k...,jk", "data": ["F", "I"]} + + +class TestEinsumTestEinsumBilinear(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "bn,anm,bm->ba", "data": ["B", "E", "I"]} + + +class TestEinsumTestEinsumOthers1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijkl, lmn->kmn", "data": ["F", "H"]} + + +class TestEinsumTestEinsumOthers2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijkl, lmn->ijn", "data": ["F", "H"]} + + +class TestEinsumBatch1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "blq,bhlk->bhlqk", "data": ["J", "K"]} + + +class TestEinsumZeroSizeTensor(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "...i, ...i", "data": ["L", "M"]} + + def test_backward(self): + operands = [TestEinsum.TEST_SAMPLES[operand] for operand in self.sample["data"]] + expected_result = np.einsum(self.sample["paradigm"], *operands) + equation = self.sample["paradigm"] + + with paddle.base.dygraph.guard(self._get_place(force_to_use_cpu=False)): + pd_operands = [ + paddle.to_tensor(operand, stop_gradient=False) for operand in operands + ] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + loss = result.sum() + loss.backward() + for x in pd_operands: + np.testing.assert_allclose(x.grad.shape, x.shape) + + with paddle.base.dygraph.guard(self._get_place(force_to_use_cpu=True)): + pd_operands = [ + paddle.to_tensor(operand, stop_gradient=False) for operand in operands + ] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + loss = result.sum() + loss.backward() + for x in pd_operands: + np.testing.assert_allclose(x.grad.shape, x.shape) + + +class TestNumpyTests(unittest.TestCase): + def setUp(self): + pass + + def _get_place(self, force_to_use_cpu=False): + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda() or is_custom_device(): + return get_device_place() + return core.CPUPlace() + + def check_output_equal(self, actual, expect, rtol=1.0e-5, atol=1.0e-8): + error_msg = ( + "Output has diff at place:{}. 
\nExpect: {} \nBut Got: {} in class {}" + ) + np.testing.assert_allclose( + actual, + expect, + rtol=rtol, + atol=atol, + err_msg=error_msg.format( + paddle.get_device(), expect, actual, self.__class__.__name__ + ), + ) + + def check_output(self, eqn, *ops): + expect = np.einsum(eqn, *ops) + with paddle.base.dygraph.guard(self._get_place(force_to_use_cpu=False)): + pd_operands = [paddle.to_tensor(op) for op in ops] + actual = paddle.einsum(eqn, *pd_operands) + self.check_output_equal(actual.numpy(), expect) + + def test_sums(self): + for n in range(1, 17): + a = np.arange(n).astype("float") + self.check_output("i->", a) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype("float") + self.check_output("...i->...", a) + + for n in range(1, 17): + a = np.arange(2 * n).reshape(2, n).astype("float") + self.check_output("i...->...", a) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype("float") + self.check_output("i...->...", a) + + for n in range(1, 17): + a = np.arange(3 * n).reshape(3, n).astype("float") + b = np.arange(2 * 3 * n).reshape(2, 3, n).astype("float") + self.check_output("..., ...", a, b) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype("float") + b = np.arange(n).astype("float") + self.check_output("...i, ...i", a, b) + + for n in range(1, 11): + a = np.arange(n * 3 * 2).reshape(n, 3, 2).astype("float") + b = np.arange(n).astype("float") + self.check_output("i..., i...", a, b) + + for n in range(1, 17): + a = (np.arange(3) + 1).astype("float") + b = (np.arange(n) + 1).astype("float") + self.check_output("i,j", a, b) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype("float") + b = np.arange(n).astype("float") + self.check_output("ij, j", a, b) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype("float") + b = np.arange(n).astype("float") + self.check_output("ji,j", a.T, b.T) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype("float") + b = np.arange(n * 6).reshape(n, 6).astype("float") + self.check_output("ij,jk", a, b) + + a = np.arange(12).reshape(3, 4).astype("float") + b = np.arange(20).reshape(4, 5).astype("float") + c = np.arange(30).reshape(5, 6).astype("float") + self.check_output("ij,jk,kl", a, b, c) + + a = np.arange(60).reshape(3, 4, 5).astype("float") + b = np.arange(24).reshape(4, 3, 2).astype("float") + self.check_output("ijk, jil -> kl", a, b) + + for n in range(1, 25): + a = np.arange(n).astype("float") + self.check_output("...,...", a, a) + self.check_output("i,i", a, a) + + p = np.ones((10, 2)).astype("float") + q = np.ones((1, 2)).astype("float") + self.check_output("ij,ij->j", p, q) + + x = np.array([2.0, 3.0]).astype("float") + y = np.array([4.0]).astype("float") + self.check_output("i, i", x, y) + + p = np.ones((1, 5)) / 2 + q = np.ones((5, 5)) / 2 + self.check_output("...ij,...jk->...ik", p, p.T) + self.check_output("...ij,...jk->...ik", p, q) + + x = np.eye(2).astype("float") + y = np.ones(2).astype("float") + self.check_output("ji,i->", x, y) + self.check_output("i,ij->", y, x) + self.check_output("ij,i->", x, y) + + def test_large_nops(self): + a = np.arange(4 * 3 * 1 * 4).reshape(4, 3, 1, 4).astype("float") + self.check_output("a...b,b...c,c...d", a, a, a) + self.check_output("a...b,b...c,c...a", a, a, a) + self.check_output("a...b,b...c,c...a", a, a, a) + self.check_output("...ab,...ba,...ab,...ab", a, a, a, a) + + def test_static_graph(self): + paddle.enable_static() + base = paddle.base + if 
base.core.is_compiled_with_cuda() or is_custom_device(): + self.place = get_device_place() + else: + self.place = base.CPUPlace() + main = base.Program() + startup = base.Program() + with base.program_guard(main, startup): + a = paddle.static.data(name="a", shape=[3, None, None, None], dtype="float") + b = paddle.static.data(name="b", shape=[2, None, None, None], dtype="float") + c = paddle.static.data(name="c", shape=[None, None, 2, None], dtype="float") + d = paddle.static.data(name="d", shape=[None, None, 5], dtype="float") + e = paddle.static.data(name="e", shape=[None, 2, None], dtype="float") + + outs = [] + outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b)) + outs.append(paddle.einsum("...ik, ...j", c, d)) + outs.append(paddle.einsum("...kj, ...ik", d, e)) + outs.append(paddle.einsum("ijk..., ikj", c, e)) + outs.append(paddle.einsum("ijk..., ikj->...ij", c, e)) + exe = base.Executor(self.place) + exe.run(startup) + a = np.arange(72).reshape(3, 2, 3, 4).astype("float") + b = np.arange(48).reshape(2, 2, 3, 4).astype("float") + c = np.arange(48).reshape(2, 3, 2, 4).astype("float") + d = np.arange(30).reshape(2, 3, 5).astype("float") + e = np.arange(12).reshape(2, 2, 3).astype("float") + feeds = {"a": a, "b": b, "c": c, "d": d, "e": e} + actual = exe.run(main, feed=feeds, fetch_list=[outs]) + expect = [] + expect.append(np.einsum("ibnd,jbnd->bnij", a, b)) + expect.append(np.einsum("...ik, ...j", c, d)) + expect.append(np.einsum("...kj, ...ik", d, e)) + expect.append(np.einsum("ijk..., ikj", c, e)) + expect.append(np.einsum("ijk..., ikj->...ij", c, e)) + for a, e in zip(actual, expect): + self.check_output_equal(a, e) + + +class TestContractionBroadcastGrad(unittest.TestCase): + def setUp(self): + self.place = ( + get_device_place() + if (paddle.is_compiled_with_cuda() or is_custom_device()) + else paddle.CPUPlace() + ) + + def test_case1(self): + with paddle.base.dygraph.guard(self.place): + # paddle.einsum("i, i", Tensor([2],"float32"), Tensor([1],"float32"), ) + x_np = np.array([0.1, 0.2]).astype(np.float32) + y_np = np.array([0.5]).astype(np.float32) + except_res = np.einsum("i, i", x_np, y_np) + except_grad_x = np.array([0.5, 0.5]).astype(np.float32) + except_grad_y = np.array([0.3]).astype(np.float32) + x = paddle.to_tensor(x_np, stop_gradient=False) + y = paddle.to_tensor(y_np, stop_gradient=False) + res = paddle.einsum("i, i", x, y) + np.testing.assert_allclose(res.numpy(), except_res) + res.sum().backward() + x.grad.get_tensor() # To check if accessing unallocated memory + np.testing.assert_allclose(x.grad.numpy(), except_grad_x) + np.testing.assert_allclose(y.grad.numpy(), except_grad_y) + + def test_case2(self): + with paddle.base.dygraph.guard(self.place): + # paddle.einsum("ij,ij->j", Tensor([2, 2],"float32"), Tensor([1, 2],"float32"), ) + x_np = np.array([[0.1, 0.2], [0.3, 0.4]]).astype(np.float32) + y_np = np.array([[0.5, 0.6]]).astype(np.float32) + except_res = np.einsum("ij,ij->j", x_np, y_np) + except_grad_x = np.array([[0.5, 0.6], [0.5, 0.6]]).astype(np.float32) + except_grad_y = np.array([[0.4, 0.6]]).astype(np.float32) + x = paddle.to_tensor(x_np, stop_gradient=False) + y = paddle.to_tensor(y_np, stop_gradient=False) + res = paddle.einsum("ij,ij->j", x, y) + np.testing.assert_allclose(res.numpy(), except_res) + res.sum().backward() + x.grad.get_tensor() # To check if accessing unallocated memory + np.testing.assert_allclose(x.grad.numpy(), except_grad_x) + np.testing.assert_allclose(y.grad.numpy(), except_grad_y) + + +if __name__ == "__main__": + unittest.main() 
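
As an aside on the broadcast-gradient expectations hard-coded in TestContractionBroadcastGrad.test_case1 above: the following NumPy-only sketch (illustrative, not part of the patch; it assumes nothing beyond NumPy and the test's own values) shows why the expected gradients are [0.5, 0.5] for x and [0.3] for y. The sum over "i" broadcasts y across x, so d(out)/dx is y repeated to x's shape and d(out)/dy accumulates x over the broadcast dimension.

import numpy as np

x = np.array([0.1, 0.2], dtype=np.float32)   # shape [2]
y = np.array([0.5], dtype=np.float32)        # shape [1], broadcast against x
# forward: 0.1*0.5 + 0.2*0.5 = 0.15
out = np.einsum("i, i", x, np.broadcast_to(y, x.shape))
grad_x = np.broadcast_to(y, x.shape).copy()              # d(out)/dx_i = y      -> [0.5, 0.5]
grad_y = np.array([x.sum()], dtype=np.float32)           # broadcast dim reduces -> [0.3]
assert np.allclose(grad_x, [0.5, 0.5]) and np.allclose(grad_y, [0.3])
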
From dbde9e02340f0b65aab89229c85c8925ca4b6e6b Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 10 Nov 2025 11:32:10 +0800 Subject: [PATCH 096/121] [metax]fix collect_fpn_proposals (#157) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_warpctc.cmake * change warpctc.cmake * test * change_run_ut * remove_tets * test * add_generate_pb * [metax]fix paddle bug * change_ut * change_ut * change_ut * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file * [metax] add Rules * [metax] change_patch * update paddle * [metax] fix dot error * [metax]rm opt path and fix activation_kernel bug * updata paddle * chang_meatx_yaml * chang_meatx_yaml * updata_metax * test * test * test * test * test * test * test * test * test * test * test * test * 
updata_enigen * updata_paddle * test * updata ignore * updata_ignore * updata flag_and_fix_activation * updataignore * updata_patch * feat: add gammaln_grad_kernel.cu * updata_softmax * updata_patch * change_flag * [metax] add private CI * [metax] add private CI * [metax] add private CI * [Metax] add private CI * [Metax] add private CI * [Metax] add private CI * [Metax] add private CI * [Metax] add private CI * [Metax] add private CI * [Metax] add Upload * chang yaml * chang ut * updata_paddle * [metax] add schedule * test * [metax]fix collect_fpn_proposals --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> Co-authored-by: root --- .../cuda_kernels/collect_fpn_proposals_kernel_register.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/kernels/cuda_kernels/collect_fpn_proposals_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/collect_fpn_proposals_kernel_register.cu index d5b1df7e2e2..8b7af1e0dbe 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/collect_fpn_proposals_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/collect_fpn_proposals_kernel_register.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.h" PD_CUSTOM_KERNEL_REGISTER(collect_fpn_proposals, From cadf29c571574ca93011a8e94ee4d6393fed60b1 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 11 Nov 2025 16:50:47 +0800 Subject: [PATCH 097/121] [Metax]Update version information (#158) --- backends/metax_gpu/build_private_CI.sh | 7 ++++--- backends/metax_gpu/compile.sh | 4 ++-- backends/metax_gpu/env.sh | 22 ++++++++++++++++++++++ backends/metax_gpu/setup.py.in | 7 ++++++- 4 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 backends/metax_gpu/env.sh diff --git a/backends/metax_gpu/build_private_CI.sh b/backends/metax_gpu/build_private_CI.sh index 113bb14a681..fabaf1ffc5b 100644 --- a/backends/metax_gpu/build_private_CI.sh +++ b/backends/metax_gpu/build_private_CI.sh @@ -67,7 +67,7 @@ export CUCC_PATH=${MACA_PATH}/tools/cu-bridge export PATH=${PATH}:${CUCC_PATH}/tools:${CUCC_PATH}/bin export PATH=${MACA_PATH}/bin:${PATH} export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} -export PADDLE_VERSION=3.3.0 +export PADDLE_VERSION="3.3.0.dev$(date +%Y%m%d)" if [ ! -d build ]; then echo "build directory not found, creating..." @@ -86,6 +86,7 @@ echo "Done!" 
cd build/dist/ ossutil ls oss://opensource-ci/paddle/ -ossutil cat oss://opensource-ci/paddle/test1 -ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/test1/ +ossutil cat oss://opensource-ci/paddle/ + +ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/ cd - diff --git a/backends/metax_gpu/compile.sh b/backends/metax_gpu/compile.sh index eba45a9ced2..20e888ef4d4 100644 --- a/backends/metax_gpu/compile.sh +++ b/backends/metax_gpu/compile.sh @@ -22,7 +22,7 @@ export CUCC_PATH=${MACA_PATH}/tools/cu-bridge export PATH=${PATH}:${CUCC_PATH}/tools:${CUCC_PATH}/bin export PATH=${MACA_PATH}/bin:${PATH} export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} - +export PADDLE_VERSION="3.3.0.dev$(date +%Y%m%d)" if [ ! -d build ]; then echo "build directory not found, creating..." mkdir build @@ -31,7 +31,7 @@ fi echo "make_maca" cd build cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON -make_maca -j10 +make_maca -j18 echo "install whl" diff --git a/backends/metax_gpu/env.sh b/backends/metax_gpu/env.sh new file mode 100644 index 00000000000..1fd07ac5480 --- /dev/null +++ b/backends/metax_gpu/env.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DEFAULT_DIR="/opt/maca" +export MACA_PATH=${1:$DEFAULT_DIR} +export CUDA_PATH=/workspace/cuda-11.7/ +export PATH=${CUDA_PATH}/bin:${PATH} +export CUCC_PATH=${MACA_PATH}/tools/cu-bridge +export PATH=${PATH}:${CUCC_PATH}/tools:${CUCC_PATH}/bin +export PATH=${MACA_PATH}/bin:${PATH} +export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} diff --git a/backends/metax_gpu/setup.py.in b/backends/metax_gpu/setup.py.in index 6c8f54c38cf..b1600e9bb5a 100644 --- a/backends/metax_gpu/setup.py.in +++ b/backends/metax_gpu/setup.py.in @@ -81,6 +81,11 @@ class BinaryDistribution(Distribution): def has_ext_modules(self): return True +# maca ai version +maca_ai_version = os.getenv('MACA_AI_VERSION') +if not maca_ai_version: + maca_ai_version = "0.0.0" + def main(): write_custom_op_api_py() @@ -89,7 +94,7 @@ def main(): setup( name = '@CMAKE_PROJECT_NAME@', - version='@PLUGIN_VERSION@', + version='@PLUGIN_VERSION@' + "+maca" + maca_ai_version, description='Paddle metax_gpu plugin', long_description='', long_description_content_type="text/markdown", From 5e4118c2940c797ee857c1ef49386b1f7dc22ddb Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 11 Nov 2025 19:28:41 +0800 Subject: [PATCH 098/121] [Metax] update env (#163) --- backends/metax_gpu/env.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/env.sh b/backends/metax_gpu/env.sh index 1fd07ac5480..c7fcf6622b4 100644 --- a/backends/metax_gpu/env.sh +++ b/backends/metax_gpu/env.sh @@ -13,10 +13,8 @@ # limitations under the License. 
DEFAULT_DIR="/opt/maca" -export MACA_PATH=${1:$DEFAULT_DIR} -export CUDA_PATH=/workspace/cuda-11.7/ -export PATH=${CUDA_PATH}/bin:${PATH} +export MACA_PATH=${1:-$DEFAULT_DIR} +export CUDA_PATH=/usr/local/cuda export CUCC_PATH=${MACA_PATH}/tools/cu-bridge -export PATH=${PATH}:${CUCC_PATH}/tools:${CUCC_PATH}/bin -export PATH=${MACA_PATH}/bin:${PATH} -export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} +export PATH=${CUDA_PATH}/bin:${MACA_PATH}/ompi/bin:${MACA_PATH}/ucx/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${PATH} +export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/ompi/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} From 6b0bc211eb7e5259b44c3f8005416f84aed85913 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 11 Nov 2025 19:41:19 +0800 Subject: [PATCH 099/121] [metax] Timed trigger (#164) --- .github/workflows/metax_work_private.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/metax_work_private.yaml b/.github/workflows/metax_work_private.yaml index b4341fa4506..3702a4d887b 100644 --- a/.github/workflows/metax_work_private.yaml +++ b/.github/workflows/metax_work_private.yaml @@ -5,8 +5,8 @@ on: pull_request: types: [opened, synchronize] branches: [develop, release/**] - schedule: - - cron: "0 15 * * *" + schedule: + - cron: "0 15 * * *" permissions: read-all defaults: From 87cc5e617b3de035a97d938f259a44b6aea19330 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 12 Nov 2025 13:34:13 +0800 Subject: [PATCH 100/121] =?UTF-8?q?=E3=80=90Metax=E3=80=91update=20(#165)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backends/metax_gpu/build_private_CI.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/build_private_CI.sh b/backends/metax_gpu/build_private_CI.sh index fabaf1ffc5b..66ee1892fe4 100644 --- a/backends/metax_gpu/build_private_CI.sh +++ b/backends/metax_gpu/build_private_CI.sh @@ -88,5 +88,5 @@ cd build/dist/ ossutil ls oss://opensource-ci/paddle/ ossutil cat oss://opensource-ci/paddle/ -ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/ +ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/ -f cd - From a9abecae481298aa7b46a65f4b4b9c7eb81045ee Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 12 Nov 2025 17:29:30 +0800 Subject: [PATCH 101/121] [Metax] fix version (#166) --- .github/workflows/CI.yml | 5 + .../{metax_work.yaml => _Metax-X86.yaml} | 0 ..._private.yaml => _Metax_work_private.yaml} | 0 backends/metax_gpu/cmake/paddle.cmake | 3 +- backends/metax_gpu/cmake/version.cmake | 128 +----------------- .../elementwise_grad_kernel_register.cu | 4 + 6 files changed, 11 insertions(+), 129 deletions(-) rename .github/workflows/{metax_work.yaml => _Metax-X86.yaml} (100%) rename .github/workflows/{metax_work_private.yaml => _Metax_work_private.yaml} (100%) mode change 100755 => 100644 backends/metax_gpu/cmake/paddle.cmake mode change 100755 => 120000 backends/metax_gpu/cmake/version.cmake diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 649f24cfd53..a46be0ee7da 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -32,6 +32,11 @@ jobs: uses: ./.github/workflows/_GCU.yml needs: [Codestyle-Check] + Metax: + name: Metax-GPU-X86 + uses: ./.github/workflows/_Metax-X86.yaml + needs: 
[Codestyle-Check] + hpu: name: hpu uses: ./.github/workflows/_HPU.yml diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/_Metax-X86.yaml similarity index 100% rename from .github/workflows/metax_work.yaml rename to .github/workflows/_Metax-X86.yaml diff --git a/.github/workflows/metax_work_private.yaml b/.github/workflows/_Metax_work_private.yaml similarity index 100% rename from .github/workflows/metax_work_private.yaml rename to .github/workflows/_Metax_work_private.yaml diff --git a/backends/metax_gpu/cmake/paddle.cmake b/backends/metax_gpu/cmake/paddle.cmake old mode 100755 new mode 100644 index 899ffd2dd30..70420a00f96 --- a/backends/metax_gpu/cmake/paddle.cmake +++ b/backends/metax_gpu/cmake/paddle.cmake @@ -1,5 +1,4 @@ -# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -# Reserved. Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of diff --git a/backends/metax_gpu/cmake/version.cmake b/backends/metax_gpu/cmake/version.cmake deleted file mode 100755 index fcf73828ea8..00000000000 --- a/backends/metax_gpu/cmake/version.cmake +++ /dev/null @@ -1,127 +0,0 @@ -# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -# Reserved. Get the latest git tag. -set(PADDLE_VERSION $ENV{PADDLE_VERSION}) -if(WITH_NIGHTLY_BUILD) - execute_process( - COMMAND ${GIT_EXECUTABLE} show -s --format=%ci HEAD - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE GIT_COMMIT_TIME - OUTPUT_STRIP_TRAILING_WHITESPACE) - string(REGEX REPLACE " (.*)$" "" DATE_ONLY "${GIT_COMMIT_TIME}") - string(REPLACE "-" "" DATE_ONLY "${DATE_ONLY}") - # Print the last commit date - message(STATUS "Last commit date: ${DATE_ONLY}") - set(PADDLE_VERSION "${PADDLE_VERSION}.dev${DATE_ONLY}") -endif() -set(tmp_version "HEAD") -set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?") -set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+") -while("${PADDLE_VERSION}" STREQUAL "") - # Check current branch name - execute_process( - COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version} - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE GIT_BRANCH_NAME - RESULT_VARIABLE GIT_BRANCH_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(NOT ${GIT_BRANCH_RESULT}) - execute_process( - COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always - ${tmp_version} - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE GIT_TAG_NAME - RESULT_VARIABLE GIT_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(NOT ${GIT_RESULT}) - # Check if current branch is release branch - if(${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}") - # Check the tag is a correct version - if(${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}") - # if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest - set(PADDLE_VERSION "0.0.0") - elseif(${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") - string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME}) - else() # otherwise, get the previous git tag name. 
- set(tmp_version "${GIT_TAG_NAME}~1") - endif() - else() - execute_process( - COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version} - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE GIT_EXACT_TAG_NAME - RESULT_VARIABLE GIT_EXACT_TAG_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(NOT ${GIT_EXACT_TAG_NAME}) - # Check if current branch is tag branch - if(${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") - string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME}) - else() - set(PADDLE_VERSION "0.0.0") - endif() - else() - # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest - set(PADDLE_VERSION "0.0.0") - endif() - endif() - else() - set(PADDLE_VERSION "0.0.0") - message(WARNING "Cannot add paddle version from git tag") - endif() - else() - set(PADDLE_VERSION "0.0.0") - message(WARNING "Cannot add paddle version for wrong git branch result") - endif() -endwhile() - -string(REPLACE "-" "." PADDLE_VER_LIST ${PADDLE_VERSION}) -string(REPLACE "." ";" PADDLE_VER_LIST ${PADDLE_VER_LIST}) -list(GET PADDLE_VER_LIST 0 PADDLE_MAJOR_VER) -list(GET PADDLE_VER_LIST 1 PADDLE_MINOR_VER) -list(GET PADDLE_VER_LIST 2 PADDLE_PATCH_VER) - -math(EXPR PADDLE_VERSION_INTEGER "${PADDLE_MAJOR_VER} * 1000000 - + ${PADDLE_MINOR_VER} * 1000 + ${PADDLE_PATCH_VER}") - -add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION}) -add_definitions(-DPADDLE_VERSION_INTEGER=${PADDLE_VERSION_INTEGER}) -message(STATUS "Paddle version is ${PADDLE_VERSION}") - -# write paddle version -function(version version_file) - execute_process( - COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_GIT_COMMIT) - file( - WRITE ${version_file} - "Paddle version: ${PADDLE_VERSION}\n" - "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" - "WITH_MKL: ${WITH_MKL}\n" - "WITH_ONEDNN: ${WITH_ONEDNN}\n" - "WITH_OPENVINO: ${WITH_OPENVINO}\n" - "WITH_GPU: ${WITH_GPU}\n" - "WITH_ROCM: ${WITH_ROCM}\n" - "WITH_IPU: ${WITH_IPU}\n") - if(WITH_GPU) - file(APPEND ${version_file} - "CUDA version: ${CUDA_VERSION}\n" - "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") - endif() - if(WITH_ROCM) - file(APPEND ${version_file} - "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n" - "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") - endif() - if(WITH_IPU) - file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n") - endif() - file(APPEND ${version_file} - "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") - if(TENSORRT_FOUND) - file( - APPEND ${version_file} - "WITH_TENSORRT: ${TENSORRT_FOUND}\n" - "TensorRT version: v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION}\n" - ) - endif() -endfunction() diff --git a/backends/metax_gpu/cmake/version.cmake b/backends/metax_gpu/cmake/version.cmake new file mode 120000 index 00000000000..7e86e34994b --- /dev/null +++ b/backends/metax_gpu/cmake/version.cmake @@ -0,0 +1 @@ +../../../cmake/version.cmake \ No newline at end of file diff --git a/backends/metax_gpu/kernels/cuda_kernels/elementwise_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/elementwise_grad_kernel_register.cu index 59baa29634f..d4154ac69a0 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/elementwise_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/elementwise_grad_kernel_register.cu @@ -13,7 +13,11 @@ // limitations under the License. 
#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h" #include "paddle/phi/kernels/elementwise_grad_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_grad_kernel.h" PD_CUSTOM_KERNEL_REGISTER(fmax_grad, metax_gpu, From 35c8a4660f0ac834c669e367749a3aade9a55c0d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 12 Nov 2025 18:16:39 +0800 Subject: [PATCH 102/121] [Metax] fix nterpolate_grad_kernel (#167) --- backends/metax_gpu/CMakeLists.txt | 1 + backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index a63ed72b0a9..ecda371f037 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -416,6 +416,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/multinomial_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/nll_loss_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/moe_unpermute_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/pool_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/norm_kernel.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index fa2c9e6e8b7..c50833dfa60 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -181,7 +181,7 @@ void RnnKernel(const Context &dev_ctx, else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; #else - VLOG(0) << "Leave lstmKernel.11"; + // VLOG(0) << "Leave lstmKernel.11"; gpuRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") rnn_mode = CUDNN_LSTM; @@ -229,7 +229,7 @@ void RnnKernel(const Context &dev_ctx, common::errors::InvalidArgument( "ROCm do not support SequenceLength yet.")); #endif - VLOG(0) << "Leave lstmKernel.12"; + // VLOG(0) << "Leave lstmKernel.12"; std::vector SequenceLength; if (has_seq_length) { SequenceLength = phi::GetVectorFromTensor(sequence_length.get_ptr()); From b3b861bc554a4c37c25ebc37e96097b7d9fca90b Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 12 Nov 2025 20:43:06 +0800 Subject: [PATCH 103/121] [metax]fix version.txt (#169) --- backends/metax_gpu/version.txt | 1 + 1 file changed, 1 insertion(+) create mode 120000 backends/metax_gpu/version.txt diff --git a/backends/metax_gpu/version.txt b/backends/metax_gpu/version.txt new file mode 120000 index 00000000000..2b9ab167213 --- /dev/null +++ b/backends/metax_gpu/version.txt @@ -0,0 +1 @@ +../../Paddle/version.txt \ No newline at end of file From c3c71e42f9f7c4e4d09e931506dcad316388d14f Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 12 Nov 2025 20:45:45 +0800 Subject: [PATCH 104/121] test (#170) --- .github/workflows/CI.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index a46be0ee7da..649f24cfd53 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -32,11 +32,6 @@ jobs: uses: ./.github/workflows/_GCU.yml needs: [Codestyle-Check] - Metax: - name: Metax-GPU-X86 - uses: 
./.github/workflows/_Metax-X86.yaml - needs: [Codestyle-Check] - hpu: name: hpu uses: ./.github/workflows/_HPU.yml From 91741a16319736657e8f9ae9d678c89fb401d4bb Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 12 Nov 2025 21:06:43 +0800 Subject: [PATCH 105/121] update yaml (#171) --- .github/workflows/CI.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index a46be0ee7da..649f24cfd53 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -32,11 +32,6 @@ jobs: uses: ./.github/workflows/_GCU.yml needs: [Codestyle-Check] - Metax: - name: Metax-GPU-X86 - uses: ./.github/workflows/_Metax-X86.yaml - needs: [Codestyle-Check] - hpu: name: hpu uses: ./.github/workflows/_HPU.yml From 6f4b3aac1fbd5ac1a73e566468cc335aca5ef1f6 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 13 Nov 2025 16:38:45 +0800 Subject: [PATCH 106/121] [Metax]add parameterized (#172) --- .github/workflows/_Metax_work_private.yaml | 7 +++++-- backends/metax_gpu/build.sh | 9 +-------- backends/metax_gpu/build_private_CI.sh | 10 +++++----- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/.github/workflows/_Metax_work_private.yaml b/.github/workflows/_Metax_work_private.yaml index 3702a4d887b..3c1e163537a 100644 --- a/.github/workflows/_Metax_work_private.yaml +++ b/.github/workflows/_Metax_work_private.yaml @@ -6,7 +6,7 @@ on: types: [opened, synchronize] branches: [develop, release/**] schedule: - - cron: "0 15 * * *" + - cron: "0 16 * * *" permissions: read-all defaults: @@ -16,7 +16,6 @@ defaults: jobs: metax-gpu-test: runs-on: paddle-metax-runner-set - # runs-on: debug-paddle-runner-set steps: - name: Checkout repository run: | @@ -96,3 +95,7 @@ jobs: fi cp backends/metax_gpu/build/dist/paddle_metax_gpu*.whl . python BosClient.py paddle_metax_gpu*.whl paddle-github-action/PaddleCustomDevice/metax_gpu/${PR_ID}/${COMMIT_ID} + cd backends/metax_gpu/build/dist/ + ossutil ls oss://opensource-ci/paddle/ + ossutil cat oss://opensource-ci/paddle/ + ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/ -f diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 9ca589a7807..6e1cdef268f 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -23,21 +23,14 @@ pip uninstall paddlepaddle -y # init paddle # git submodule sync --recursive && git submodule update --init --recursive -# sleep 1000000 -# unset http_proxy https_proxy - -# export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 -# export -pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package +pip install parameterized safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -# unset http_proxy https_proxy - # apply patch bash change_patch.sh diff --git a/backends/metax_gpu/build_private_CI.sh b/backends/metax_gpu/build_private_CI.sh index 66ee1892fe4..9a1a772793e 100644 --- a/backends/metax_gpu/build_private_CI.sh +++ b/backends/metax_gpu/build_private_CI.sh @@ -84,9 +84,9 @@ pip install dist/paddle_metax_gpu*.whl --force-reinstall cd .. echo "Done!" 
-cd build/dist/ -ossutil ls oss://opensource-ci/paddle/ -ossutil cat oss://opensource-ci/paddle/ +# cd build/dist/ +# ossutil ls oss://opensource-ci/paddle/ +# ossutil cat oss://opensource-ci/paddle/ -ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/ -f -cd - +# ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/ -f +# cd - From 60de38ed88bbabba48bbd4c136d28d39ac7899db Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 17 Nov 2025 11:25:43 +0800 Subject: [PATCH 107/121] [Metax] Assign data stream to CUDA (#174) --- .github/workflows/_Metax-X86.yaml | 2 +- .github/workflows/_Metax_work_private.yaml | 2 +- backends/metax_gpu/runtime/runtime.cc | 9 ++++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/_Metax-X86.yaml b/.github/workflows/_Metax-X86.yaml index 486236955ad..3d2b6cb65fc 100644 --- a/.github/workflows/_Metax-X86.yaml +++ b/.github/workflows/_Metax-X86.yaml @@ -78,7 +78,7 @@ jobs: run: | cd backends/metax_gpu/tests - bash run_test.sh -j 8 + bash run_test.sh -j 32 - name: push whl env: diff --git a/.github/workflows/_Metax_work_private.yaml b/.github/workflows/_Metax_work_private.yaml index 3c1e163537a..fc65426c99c 100644 --- a/.github/workflows/_Metax_work_private.yaml +++ b/.github/workflows/_Metax_work_private.yaml @@ -79,7 +79,7 @@ jobs: run: | cd backends/metax_gpu/tests - bash run_test.sh -j 8 + bash run_test.sh -j 32 - name: push whl env: diff --git a/backends/metax_gpu/runtime/runtime.cc b/backends/metax_gpu/runtime/runtime.cc index 494b1a71258..9460cf574da 100644 --- a/backends/metax_gpu/runtime/runtime.cc +++ b/backends/metax_gpu/runtime/runtime.cc @@ -579,7 +579,8 @@ C_Status AsyncMemCpyH2D(const C_Device device, return C_ERROR; } - cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice); + cudaErr = cudaMemcpyAsync( + dst, src, size, cudaMemcpyHostToDevice, (cudaStream_t)stream); if (cudaErr != cudaSuccess) { return C_ERROR; } @@ -605,7 +606,8 @@ C_Status AsyncMemCpyD2H(const C_Device device, return C_ERROR; } - cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost); + cudaErr = cudaMemcpyAsync( + dst, src, size, cudaMemcpyDeviceToHost, (cudaStream_t)stream); if (cudaErr != cudaSuccess) { return C_ERROR; } @@ -633,7 +635,8 @@ C_Status AsyncMemCpyD2D(const C_Device device, return C_ERROR; } - cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice); + cudaErr = cudaMemcpyAsync( + dst, src, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream); if (cudaErr != cudaSuccess) { return C_ERROR; } From 564253d64bf5069fc99f677bd9d373539ed6f580 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 17 Nov 2025 13:38:52 +0800 Subject: [PATCH 108/121] [Metax] fix CUDA Kernel No.50 (#175) --- .../embedding_with_scaled_gradient_grad_kernel_register.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/metax_gpu/kernels/cuda_kernels/embedding_with_scaled_gradient_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/embedding_with_scaled_gradient_grad_kernel_register.cu index 46d10ade577..ec6ff5f053d 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/embedding_with_scaled_gradient_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/embedding_with_scaled_gradient_grad_kernel_register.cu @@ -16,8 +16,7 @@ #include "paddle/phi/common/complex.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" -#include 
"paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.h" // NOLINT - +#include "paddle/phi/kernels/embedding_with_scaled_gradient_grad_kernel.h" // NOLINT PD_CUSTOM_KERNEL_REGISTER(embedding_with_scaled_gradient_grad, metax_gpu, ALL_LAYOUT, From cd3f42d70597afd4d8099861b0df8fb0455e6a5c Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 17 Nov 2025 14:23:22 +0800 Subject: [PATCH 109/121] [metax] change yaml (#176) --- .github/workflows/_Metax-X86.yaml | 2 +- .github/workflows/_Metax_work_private.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_Metax-X86.yaml b/.github/workflows/_Metax-X86.yaml index 3d2b6cb65fc..a999a9ddb5d 100644 --- a/.github/workflows/_Metax-X86.yaml +++ b/.github/workflows/_Metax-X86.yaml @@ -78,7 +78,7 @@ jobs: run: | cd backends/metax_gpu/tests - bash run_test.sh -j 32 + bash run_test.sh -j 16 - name: push whl env: diff --git a/.github/workflows/_Metax_work_private.yaml b/.github/workflows/_Metax_work_private.yaml index fc65426c99c..637bf8d5f34 100644 --- a/.github/workflows/_Metax_work_private.yaml +++ b/.github/workflows/_Metax_work_private.yaml @@ -79,7 +79,7 @@ jobs: run: | cd backends/metax_gpu/tests - bash run_test.sh -j 32 + bash run_test.sh -j 16 - name: push whl env: From ddc8d1dc709198012f491b9f099695f172fe6cd3 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:38:05 +0800 Subject: [PATCH 110/121] [metax] Add some tests for CI (#173) --- backends/metax_gpu/tests/ignore.txt | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index 2b0fae559e6..215280b8cb8 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -1,32 +1,14 @@ test_matmul_op_metax test_sum_op -test_max_op -test_cumsum_op -test_softmax_with_cross_entropy_op -test_softmax_op test_elementwise_add_op test_gather_op test_elementwise_pow_op test_layer_norm_op -test_index_add_op test_elementwise_div_op -test_stack_op -test_logical_op test_mean_op -test_transpose_op -test_randint_op -test_uniform_random_op test_c_embedding_op -test_slice_op test_compare_op test_conv3d_transpose_op test_conv3d_layer test_conv3d_transpose_part2_op test_fused_conv2d_add_act_op -test_swiglu_metax -test_squared_l2_norm_op -test_dygraph_spectral_norm -test_bincount_op -test_adamw_op -test_einsum_op -test_complex_matmul From 3411e401775a44c7f9f572f1de6b3639b22d3ff1 Mon Sep 17 00:00:00 2001 From: metax666 Date: Tue, 18 Nov 2025 10:30:56 +0800 Subject: [PATCH 111/121] Change test script to use 8 jobs instead of 16 --- .github/workflows/_Metax_work_private.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_Metax_work_private.yaml b/.github/workflows/_Metax_work_private.yaml index 637bf8d5f34..3c1e163537a 100644 --- a/.github/workflows/_Metax_work_private.yaml +++ b/.github/workflows/_Metax_work_private.yaml @@ -79,7 +79,7 @@ jobs: run: | cd backends/metax_gpu/tests - bash run_test.sh -j 16 + bash run_test.sh -j 8 - name: push whl env: From 0cc416ad7d15ed7f3096e431cfe347e9dfc92ba6 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:50:01 +0800 Subject: [PATCH 112/121] =?UTF-8?q?=E3=80=90Metax=E3=80=91fix=20patch=20(#?= =?UTF-8?q?178)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
backends/metax_gpu/build.sh | 6 +++++- backends/metax_gpu/build_in_metax.sh | 3 +++ backends/metax_gpu/build_private_CI.sh | 6 +++++- backends/metax_gpu/patch/paddle.patch | 24 ++++++++++++------------ backends/metax_gpu/requirement.txt | 3 +++ backends/metax_gpu/tests/run_test.sh | 2 +- 6 files changed, 29 insertions(+), 15 deletions(-) create mode 100644 backends/metax_gpu/requirement.txt diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 6e1cdef268f..417f50e2a72 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -16,6 +16,10 @@ # limitations under the License. set -e + +# install requirement.txt +pip install -r requirement.txt -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + # uninstall paddle pip uninstall paddlepaddle -y @@ -24,7 +28,7 @@ pip uninstall paddlepaddle -y # git submodule sync --recursive && git submodule update --init --recursive -pip install parameterized safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package +# pip install parameterized safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle diff --git a/backends/metax_gpu/build_in_metax.sh b/backends/metax_gpu/build_in_metax.sh index 67ec1a2c31c..9be248dbf1d 100644 --- a/backends/metax_gpu/build_in_metax.sh +++ b/backends/metax_gpu/build_in_metax.sh @@ -17,6 +17,9 @@ set -e +# install requirement.txt +pip install -r requirement.txt -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + # init paddle git submodule sync --recursive && git submodule update --init --recursive diff --git a/backends/metax_gpu/build_private_CI.sh b/backends/metax_gpu/build_private_CI.sh index 9a1a772793e..5d5c9f3f9a4 100644 --- a/backends/metax_gpu/build_private_CI.sh +++ b/backends/metax_gpu/build_private_CI.sh @@ -16,6 +16,10 @@ # limitations under the License. set -e + +# install requirement.txt +pip install -r requirement.txt -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + # uninstall paddle pip uninstall paddlepaddle -y @@ -49,7 +53,7 @@ echo "✅ 脚本执行完毕!" echo "📌 已撤销本地修改,并更新到 Paddle 最新的 develop (dev) 分支代码。" -pip install parameterized safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package +# pip install parameterized safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 8cd18045094..c9390e0c4d7 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -229,7 +229,7 @@ index c5309e7e11..3328571380 100644 } \ }; \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -index 4ff2e528a9..23f7f4b583 100644 +index 092365a961..23d3b65dc6 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ @@ -237,7 +237,7 @@ index 4ff2e528a9..23f7f4b583 100644 /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); -@@ -25,7 +26,7 @@ namespace phi { +@@ -23,7 +24,7 @@ namespace phi { namespace backends { namespace gpu { @@ -246,7 +246,7 @@ index 4ff2e528a9..23f7f4b583 100644 #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -@@ -45,12 +46,12 @@ namespace gpu { +@@ -43,12 +44,12 @@ namespace gpu { template __forceinline__ __device__ T @@ -261,7 +261,7 @@ index 4ff2e528a9..23f7f4b583 100644 T val, int width = warpSize) { return __shfl_xor_sync(mask, val, width); -@@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +@@ -56,14 +57,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( @@ -278,7 +278,7 @@ index 4ff2e528a9..23f7f4b583 100644 #if defined(PADDLE_CUDA_BF16) return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); -@@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( +@@ -75,7 +76,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( @@ -287,7 +287,7 @@ index 4ff2e528a9..23f7f4b583 100644 float real = static_cast(__shfl_down_sync( mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( -@@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( +@@ -85,7 +86,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( @@ -296,7 +296,7 @@ index 4ff2e528a9..23f7f4b583 100644 double real = static_cast(__shfl_down_sync(mask, static_cast(val.real), -@@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( +@@ -101,20 +102,20 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( @@ -309,10 +309,9 @@ index 4ff2e528a9..23f7f4b583 100644 __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { + unsigned long long mask, phi::dtype::bfloat16 val, int width) { - #if defined(PADDLE_CUDA_BF16) return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); -@@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + } template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( @@ -321,7 +320,7 @@ index 4ff2e528a9..23f7f4b583 100644 float real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( -@@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( +@@ -124,7 +125,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( @@ -330,7 +329,7 @@ index 4ff2e528a9..23f7f4b583 100644 double real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( -@@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( +@@ -134,7 +135,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( template __forceinline__ __device__ T @@ -339,7 +338,7 @@ index 4ff2e528a9..23f7f4b583 100644 return __shfl_sync(mask, val, src_line, width); } -@@ -160,7 +161,7 @@ 
__device__ T reduceSum(T val, int tid, int len) { +@@ -153,7 +154,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; __shared__ T shm[warpSize]; @@ -348,6 +347,7 @@ index 4ff2e528a9..23f7f4b583 100644 CREATE_SHFL_MASK(mask, tid < len); for (int offset = warpSize / 2; offset > 0; offset /= 2) + diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..66b373d698 100644 --- a/paddle/phi/core/enforce.h diff --git a/backends/metax_gpu/requirement.txt b/backends/metax_gpu/requirement.txt new file mode 100644 index 00000000000..8e45c236cfc --- /dev/null +++ b/backends/metax_gpu/requirement.txt @@ -0,0 +1,3 @@ +parameterized +safetensors==0.6.2 +scipy diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index 31b175a60bc..609c3ae7540 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -pip install scipy -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package +# pip install scipy -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package SCRIPT_DIR=$(dirname "$0") LEGACY_TEST_PATH="${SCRIPT_DIR}/../../../Paddle/test/legacy_test" TEST_PATH1="${SCRIPT_DIR}/../../../python" From ee88ab165cf253db6931943e0b86520abe80b7ff Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 21 Nov 2025 17:22:23 +0800 Subject: [PATCH 113/121] [METAX] Modify CI logic (#179) --- .github/workflows/CI.yml | 5 ++ .github/workflows/_Metax-X86.yaml | 120 +++++++++++++++--------------- 2 files changed, 66 insertions(+), 59 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 649f24cfd53..b8c8106d198 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -47,6 +47,11 @@ jobs: uses: ./.github/workflows/_IXUCA.yml needs: [Codestyle-Check] + metax: + name: metax + uses: ./.github/workflows/_Metax-X86.yaml + needs: [Codestyle-Check] + #sdaa: #name: sdaa #uses: ./.github/workflows/_SDAA.yml diff --git a/.github/workflows/_Metax-X86.yaml b/.github/workflows/_Metax-X86.yaml index 486236955ad..df8b320010a 100644 --- a/.github/workflows/_Metax-X86.yaml +++ b/.github/workflows/_Metax-X86.yaml @@ -1,11 +1,19 @@ name: paddle metax gpu test on: - workflow_dispatch: - pull_request: - types: [opened, synchronize] - branches: [develop, release/**] -permissions: read-all + workflow_call: + inputs: + workflow-name: + type: string + required: false + clone_dir: + type: string + required: false + default: 'PaddlecustomDevice' + is_pr: + type: string + required: false + default: 'true' defaults: run: @@ -13,71 +21,65 @@ defaults: jobs: metax-gpu-test: - runs-on: paddle-metax-runner-set - # runs-on: debug-paddle-runner-set + env: + PR_ID: ${{ github.event.pull_request.number }} + COMMIT_ID: ${{ github.event.pull_request.head.sha }} + BRANCH: develop + runs-on: paddle-metax-runner-set + # runs-on: debug-paddle-runner-set steps: - name: Checkout repository run: | - git config --global user.name "GitHub Actions" - git config --global user.email "actions@github.com" - - git clone \ - --reference-if-able /home/runner/PaddleCustomDevice \ - --depth=1 \ - --shallow-submodules \ - --jobs=8 \ - --branch ${{ github.base_ref || github.ref_name}} \ - --recurse-submodules \ - https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN 
}}@github.com/${{ github.repository }}.git . - - if [ "${{ github.event_name }}" == "pull_request" ]; then - git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head - git checkout pull/${{ github.event.pull_request.number }}/head - - - - - paddle_branch=${{ github.base_ref || github.ref_name}} - echo $paddle_branch - # sleep 10000 - change_numbers=$(git diff --name-only remotes/origin/${paddle_branch} | wc -l) - echo $change_numbers - - - change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/" || true) - echo $change_backend - change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/metax_gpu" || true) - echo $change_metax_only - - # change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/"| wc -l) - # echo $change_backend - # change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/metax_gpu"| wc -l) - # echo $change_metax_only - - git diff --name-only remotes/origin/${paddle_branch} - - if [ $change_numbers -ne $change_backend ]; then - echo "Common file changed, continue to run metax FULL CI test ..." - elif [ $paddle_branch -eq 0 ] ; then - echo "NO metax backend changes found, skip metax FULL CI ....." - exit 0 - fi - - - # git submodule update --init --recursive + set -x + + wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PaddleCustomDevice/PR/${PR_ID}/${COMMIT_ID}/PaddleCustomDevice.tar.gz --no-check-certificate + echo "Extracting PaddleCustomDevice.tar.gz" + tar -xf PaddleCustomDevice.tar.gz + cd PaddleCustomDevice + git config --global --add safe.directory "*" + git remote add upstream https://github.com/PaddlePaddle/PaddleCustomDevice.git + git merge ${BRANCH} --no-edit + git --no-pager log --pretty=oneline -5' + + - name: Check bypass + id: check-bypass + uses: ./PaddleCustomDevice/.github/actions/check-bypass + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + workflow-name: metax + + - name: RUN METAX-GPU + id: run-metax + if: steps.check-bypass.outputs.can-skip != 'true' + run: | + cd PaddleCustomDevice + # !!!!! SKIP IF NO METAX CHANGE !!!! + echo "=========== Checking PR Changes If METAX FULL CI Needed ===========" + change_numbers=$(git diff --name-only remotes/origin/${BRANCH} | wc -l) + change_backend=$(git diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l) + change_metax_only=$(git diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l) + git --no-pager diff --name-only remotes/origin/${BRANCH} + + if [ $change_numbers -ne $change_backend ]; then + echo "Common file changed, continue to run METAX FULL CI test ..." + echo "should_skip=false" >> $GITHUB_OUTPUT + elif [ $change_metax_only -eq 0 ] ; then + echo "NO METAX backend changes found, skip METAX FULL CI ...." + echo "should_skip=true" >> $GITHUB_OUTPUT + exit 0 + else + echo "should_skip=false" >> $GITHUB_OUTPUT fi - - name: compile run: | - # sleep 10000 - cd backends/metax_gpu + cd PaddleCustomDevice/backends/metax_gpu bash build.sh - name: run test run: | - cd backends/metax_gpu/tests + cd PaddleCustomDevice/backends/metax_gpu/tests bash run_test.sh -j 8 - name: push whl @@ -92,5 +94,5 @@ jobs: wget -q --no-proxy https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate tar xf bos_retry.tar.gz fi - cp backends/metax_gpu/build/dist/paddle_metax_gpu*.whl . 
+ cp PaddleCustomDevice/backends/metax_gpu/build/dist/paddle_metax_gpu*.whl . python BosClient.py paddle_metax_gpu*.whl paddle-github-action/PaddleCustomDevice/metax_gpu/${PR_ID}/${COMMIT_ID} From 4c42030b3f04532cc716d2cff7e677af765b37db Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 21 Nov 2025 19:18:40 +0800 Subject: [PATCH 114/121] [Metax] fix patch (#180) --- backends/metax_gpu/patch/paddle.patch | 52 ++++++++++++++------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index c9390e0c4d7..bc74c4e44da 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -19,10 +19,10 @@ index cfada544d4..a690e97d74 100644 set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt -index 99a0116d92..2566e7c41a 100755 +index 8d445b39ae..504e7b6293 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt -@@ -43,6 +43,11 @@ if(WITH_GPU OR WITH_ROCM) +@@ -39,6 +39,11 @@ if(WITH_GPU OR WITH_ROCM) op_library(fused_multi_transformer_int8_op) endif() @@ -34,19 +34,6 @@ index 99a0116d92..2566e7c41a 100755 if(CUDA_VERSION GREATER_EQUAL 11.6) op_library(fused_gemm_epilogue_op) endif() -diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc -index bff0f2bf70..9376b5781f 100644 ---- a/paddle/fluid/platform/profiler/cupti_data_process.cc -+++ b/paddle/fluid/platform/profiler/cupti_data_process.cc -@@ -16,7 +16,7 @@ - - #include - --#include "paddle/fluid/platform/enforce.h" -+// #include "paddle/fluid/platform/enforce.h" - #include "paddle/phi/core/os_info.h" - #include "paddle/phi/core/platform/device/gpu/gpu_info.h" - #include "paddle/phi/core/platform/profiler/utils.h" diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h index bda9cbe17e..c73eba9c8a 100644 --- a/paddle/phi/backends/dynload/cublas.h @@ -98,7 +85,7 @@ index 8b2e08c777..ca926df151 100644 #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasLtCreate); \ diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h -index a943bbed9a..af931490e3 100644 +index ad2ada9dfa..9e8389e7dc 100644 --- a/paddle/phi/backends/dynload/cudnn.h +++ b/paddle/phi/backends/dynload/cudnn.h @@ -38,7 +38,10 @@ extern void EnforceCUDNNLoaded(const char* fn_name); @@ -134,7 +121,7 @@ index 1547909d92..ef20838434 100644 } \ }; \ diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h -index 59e92955c9..d2f8c2da15 100644 +index 4241a512e8..94e32b743e 100644 --- a/paddle/phi/backends/dynload/cupti.h +++ b/paddle/phi/backends/dynload/cupti.h @@ -24,8 +24,8 @@ limitations under the License. 
*/ @@ -148,7 +135,7 @@ index 59e92955c9..d2f8c2da15 100644 extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; -@@ -71,7 +71,7 @@ extern void *cupti_dso_handle; +@@ -105,7 +105,7 @@ inline bool IsXPUTracingEnabled() { CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP @@ -191,7 +178,7 @@ index e8cb0ac643..e8e7596d44 100644 } \ }; \ diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc -index c74ae9592e..f6dc68917c 100644 +index 39f50bd95d..4d627b99b7 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -18,7 +18,6 @@ limitations under the License. */ @@ -229,7 +216,7 @@ index c5309e7e11..3328571380 100644 } \ }; \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -index 092365a961..23d3b65dc6 100644 +index 092365a961..8bd3f9fcea 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ @@ -347,7 +334,22 @@ index 092365a961..23d3b65dc6 100644 CREATE_SHFL_MASK(mask, tid < len); for (int offset = warpSize / 2; offset > 0; offset /= 2) - +diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h +index d970878dc2..fe0382ccad 100644 +--- a/paddle/phi/common/float16.h ++++ b/paddle/phi/common/float16.h +@@ -105,8 +105,9 @@ struct PADDLE_ALIGN(2) float16 { + #endif + + HOSTDEVICE inline explicit float16(float val) { +-#if defined(PADDLE_CUDA_FP16) && \ +- (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) ++// #if defined(PADDLE_CUDA_FP16) && \ ++// (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) ++#if 1 + half tmp = __float2half(val); + x = *reinterpret_cast(&tmp); + diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..66b373d698 100644 --- a/paddle/phi/core/enforce.h @@ -651,7 +653,7 @@ index 461e6e2474..48a64ae9ce 100644 dim3 threads(kWarpSize, kBlockDimY); dim3 grids(static_cast((D + kWarpSize - 1) / kWarpSize)); diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h -index 4eae698648..5c047723ea 100644 +index 470b0d33ee..d58838d53c 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -43,11 +43,11 @@ template @@ -995,7 +997,7 @@ index 9d4bb18d55..80405c2b78 100644 } } diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -index acb3b83bc9..264d2a2b3e 100644 +index 6cf08a5ac7..c09018ba78 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -15,7 +15,7 @@ @@ -1008,7 +1010,7 @@ index acb3b83bc9..264d2a2b3e 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -index b2d15a59f8..f64582e85a 100644 +index 1e7869afec..26ac439fc7 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -15,7 +15,7 @@ @@ -1021,7 +1023,7 @@ index b2d15a59f8..f64582e85a 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h -index 
2edac5eba5..4f265e3db7 100644 +index 770a3e1296..b0ec1b949b 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -29,8 +29,8 @@ namespace cub = hipcub; From 6a7f7e2cfe77468fecf522f22c919158e3668510 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 24 Nov 2025 10:28:06 +0800 Subject: [PATCH 115/121] ignore bilinear_interp_v2_op (#181) --- backends/metax_gpu/tests/ignore.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index 215280b8cb8..b1391194d7f 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -12,3 +12,4 @@ test_conv3d_transpose_op test_conv3d_layer test_conv3d_transpose_part2_op test_fused_conv2d_add_act_op +test_bilinear_interp_v2_op From 49cc5753a2be008fd96f2aabf654f5e8e701a5d6 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 24 Nov 2025 13:40:09 +0800 Subject: [PATCH 116/121] change yaml-yml (#182) --- .github/workflows/_Metax-X86.yaml | 1 - .github/workflows/_Metax-X86.yml | 97 +++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/_Metax-X86.yml diff --git a/.github/workflows/_Metax-X86.yaml b/.github/workflows/_Metax-X86.yaml index df8b320010a..9531f46fa9c 100644 --- a/.github/workflows/_Metax-X86.yaml +++ b/.github/workflows/_Metax-X86.yaml @@ -31,7 +31,6 @@ jobs: - name: Checkout repository run: | set -x - wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PaddleCustomDevice/PR/${PR_ID}/${COMMIT_ID}/PaddleCustomDevice.tar.gz --no-check-certificate echo "Extracting PaddleCustomDevice.tar.gz" tar -xf PaddleCustomDevice.tar.gz diff --git a/.github/workflows/_Metax-X86.yml b/.github/workflows/_Metax-X86.yml new file mode 100644 index 00000000000..9531f46fa9c --- /dev/null +++ b/.github/workflows/_Metax-X86.yml @@ -0,0 +1,97 @@ +name: paddle metax gpu test + +on: + workflow_call: + inputs: + workflow-name: + type: string + required: false + clone_dir: + type: string + required: false + default: 'PaddlecustomDevice' + is_pr: + type: string + required: false + default: 'true' + +defaults: + run: + shell: bash + +jobs: + metax-gpu-test: + env: + PR_ID: ${{ github.event.pull_request.number }} + COMMIT_ID: ${{ github.event.pull_request.head.sha }} + BRANCH: develop + runs-on: paddle-metax-runner-set + # runs-on: debug-paddle-runner-set + steps: + - name: Checkout repository + run: | + set -x + wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PaddleCustomDevice/PR/${PR_ID}/${COMMIT_ID}/PaddleCustomDevice.tar.gz --no-check-certificate + echo "Extracting PaddleCustomDevice.tar.gz" + tar -xf PaddleCustomDevice.tar.gz + cd PaddleCustomDevice + git config --global --add safe.directory "*" + git remote add upstream https://github.com/PaddlePaddle/PaddleCustomDevice.git + git merge ${BRANCH} --no-edit + git --no-pager log --pretty=oneline -5' + + - name: Check bypass + id: check-bypass + uses: ./PaddleCustomDevice/.github/actions/check-bypass + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + workflow-name: metax + + - name: RUN METAX-GPU + id: run-metax + if: steps.check-bypass.outputs.can-skip != 'true' + run: | + cd PaddleCustomDevice + # !!!!! SKIP IF NO METAX CHANGE !!!! 
+ echo "=========== Checking PR Changes If METAX FULL CI Needed ===========" + change_numbers=$(git diff --name-only remotes/origin/${BRANCH} | wc -l) + change_backend=$(git diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l) + change_metax_only=$(git diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l) + git --no-pager diff --name-only remotes/origin/${BRANCH} + + if [ $change_numbers -ne $change_backend ]; then + echo "Common file changed, continue to run METAX FULL CI test ..." + echo "should_skip=false" >> $GITHUB_OUTPUT + elif [ $change_metax_only -eq 0 ] ; then + echo "NO METAX backend changes found, skip METAX FULL CI ...." + echo "should_skip=true" >> $GITHUB_OUTPUT + exit 0 + else + echo "should_skip=false" >> $GITHUB_OUTPUT + fi + + - name: compile + run: | + cd PaddleCustomDevice/backends/metax_gpu + bash build.sh + + - name: run test + + run: | + cd PaddleCustomDevice/backends/metax_gpu/tests + bash run_test.sh -j 8 + + - name: push whl + env: + PR_ID: ${{ github.event.pull_request.number }} + COMMIT_ID: ${{ github.event.pull_request.head.sha }} + run: | + pip install bce-python-sdk==0.8.74 + export AK=paddle + export SK=paddle + if [ ! -f "BosClient.py}" ]; then + wget -q --no-proxy https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate + tar xf bos_retry.tar.gz + fi + cp PaddleCustomDevice/backends/metax_gpu/build/dist/paddle_metax_gpu*.whl . + python BosClient.py paddle_metax_gpu*.whl paddle-github-action/PaddleCustomDevice/metax_gpu/${PR_ID}/${COMMIT_ID} From 57305a5ba54ae47630661bff56bd5148aa68a637 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 24 Nov 2025 14:11:14 +0800 Subject: [PATCH 117/121] test (#183) --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b8c8106d198..e2b4d4820a0 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -49,7 +49,7 @@ jobs: metax: name: metax - uses: ./.github/workflows/_Metax-X86.yaml + uses: ./.github/workflows/_Metax-X86.yml needs: [Codestyle-Check] #sdaa: From 7b323161152651686be2a3ae288812bab2686989 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 24 Nov 2025 15:01:19 +0800 Subject: [PATCH 118/121] rm metax ci (#184) --- .github/workflows/CI.yml | 1 - .github/workflows/_Metax-X86.yml | 17 +++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index e2b4d4820a0..5d3832a35d7 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -51,7 +51,6 @@ jobs: name: metax uses: ./.github/workflows/_Metax-X86.yml needs: [Codestyle-Check] - #sdaa: #name: sdaa #uses: ./.github/workflows/_SDAA.yml diff --git a/.github/workflows/_Metax-X86.yml b/.github/workflows/_Metax-X86.yml index 9531f46fa9c..2ff64cae36a 100644 --- a/.github/workflows/_Metax-X86.yml +++ b/.github/workflows/_Metax-X86.yml @@ -1,4 +1,5 @@ -name: paddle metax gpu test +name: PR-CI-METAX + on: workflow_call: @@ -15,18 +16,21 @@ on: required: false default: 'true' + defaults: run: shell: bash + jobs: metax-gpu-test: + runs-on: paddle-metax-runner-set env: PR_ID: ${{ github.event.pull_request.number }} COMMIT_ID: ${{ github.event.pull_request.head.sha }} BRANCH: develop - runs-on: paddle-metax-runner-set - # runs-on: debug-paddle-runner-set + + steps: - name: Checkout repository run: | @@ -38,7 +42,7 @@ jobs: git config 
--global --add safe.directory "*" git remote add upstream https://github.com/PaddlePaddle/PaddleCustomDevice.git git merge ${BRANCH} --no-edit - git --no-pager log --pretty=oneline -5' + git --no-pager log --pretty=oneline -5 - name: Check bypass id: check-bypass @@ -47,6 +51,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} workflow-name: metax + - name: RUN METAX-GPU id: run-metax if: steps.check-bypass.outputs.can-skip != 'true' @@ -54,11 +59,12 @@ jobs: cd PaddleCustomDevice # !!!!! SKIP IF NO METAX CHANGE !!!! echo "=========== Checking PR Changes If METAX FULL CI Needed ===========" + change_numbers=$(git diff --name-only remotes/origin/${BRANCH} | wc -l) + change_backend=$(git diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l) change_metax_only=$(git diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l) git --no-pager diff --name-only remotes/origin/${BRANCH} - if [ $change_numbers -ne $change_backend ]; then echo "Common file changed, continue to run METAX FULL CI test ..." echo "should_skip=false" >> $GITHUB_OUTPUT @@ -76,7 +82,6 @@ jobs: bash build.sh - name: run test - run: | cd PaddleCustomDevice/backends/metax_gpu/tests bash run_test.sh -j 8 From 2dd81b8d822ab84cc877cea2f3e20ba1d99c9206 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 24 Nov 2025 16:00:18 +0800 Subject: [PATCH 119/121] updata paddle (#185) --- .github/workflows/_Metax-X86.yaml | 97 ------------------------------- Paddle | 2 +- 2 files changed, 1 insertion(+), 98 deletions(-) delete mode 100644 .github/workflows/_Metax-X86.yaml diff --git a/.github/workflows/_Metax-X86.yaml b/.github/workflows/_Metax-X86.yaml deleted file mode 100644 index 9531f46fa9c..00000000000 --- a/.github/workflows/_Metax-X86.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: paddle metax gpu test - -on: - workflow_call: - inputs: - workflow-name: - type: string - required: false - clone_dir: - type: string - required: false - default: 'PaddlecustomDevice' - is_pr: - type: string - required: false - default: 'true' - -defaults: - run: - shell: bash - -jobs: - metax-gpu-test: - env: - PR_ID: ${{ github.event.pull_request.number }} - COMMIT_ID: ${{ github.event.pull_request.head.sha }} - BRANCH: develop - runs-on: paddle-metax-runner-set - # runs-on: debug-paddle-runner-set - steps: - - name: Checkout repository - run: | - set -x - wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PaddleCustomDevice/PR/${PR_ID}/${COMMIT_ID}/PaddleCustomDevice.tar.gz --no-check-certificate - echo "Extracting PaddleCustomDevice.tar.gz" - tar -xf PaddleCustomDevice.tar.gz - cd PaddleCustomDevice - git config --global --add safe.directory "*" - git remote add upstream https://github.com/PaddlePaddle/PaddleCustomDevice.git - git merge ${BRANCH} --no-edit - git --no-pager log --pretty=oneline -5' - - - name: Check bypass - id: check-bypass - uses: ./PaddleCustomDevice/.github/actions/check-bypass - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - workflow-name: metax - - - name: RUN METAX-GPU - id: run-metax - if: steps.check-bypass.outputs.can-skip != 'true' - run: | - cd PaddleCustomDevice - # !!!!! SKIP IF NO METAX CHANGE !!!! 
- echo "=========== Checking PR Changes If METAX FULL CI Needed ===========" - change_numbers=$(git diff --name-only remotes/origin/${BRANCH} | wc -l) - change_backend=$(git diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l) - change_metax_only=$(git diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l) - git --no-pager diff --name-only remotes/origin/${BRANCH} - - if [ $change_numbers -ne $change_backend ]; then - echo "Common file changed, continue to run METAX FULL CI test ..." - echo "should_skip=false" >> $GITHUB_OUTPUT - elif [ $change_metax_only -eq 0 ] ; then - echo "NO METAX backend changes found, skip METAX FULL CI ...." - echo "should_skip=true" >> $GITHUB_OUTPUT - exit 0 - else - echo "should_skip=false" >> $GITHUB_OUTPUT - fi - - - name: compile - run: | - cd PaddleCustomDevice/backends/metax_gpu - bash build.sh - - - name: run test - - run: | - cd PaddleCustomDevice/backends/metax_gpu/tests - bash run_test.sh -j 8 - - - name: push whl - env: - PR_ID: ${{ github.event.pull_request.number }} - COMMIT_ID: ${{ github.event.pull_request.head.sha }} - run: | - pip install bce-python-sdk==0.8.74 - export AK=paddle - export SK=paddle - if [ ! -f "BosClient.py}" ]; then - wget -q --no-proxy https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate - tar xf bos_retry.tar.gz - fi - cp PaddleCustomDevice/backends/metax_gpu/build/dist/paddle_metax_gpu*.whl . - python BosClient.py paddle_metax_gpu*.whl paddle-github-action/PaddleCustomDevice/metax_gpu/${PR_ID}/${COMMIT_ID} diff --git a/Paddle b/Paddle index db736a01176..530cae468b7 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit db736a011768c9b112723d60726f0b14d2c5e4e2 +Subproject commit 530cae468b772f904c21e544791b2b89cdf48b8e From 865c17eed61c9119faaeff899891b552d52f18b7 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 24 Nov 2025 16:26:26 +0800 Subject: [PATCH 120/121] updata_paddle (#186) From 32d26a27d59ef4ba5dc45e02b96b4c1a136241f0 Mon Sep 17 00:00:00 2001 From: metax666 Date: Mon, 24 Nov 2025 16:43:53 +0800 Subject: [PATCH 121/121] tets --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 530cae468b7..43f16a629f5 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 530cae468b772f904c21e544791b2b89cdf48b8e +Subproject commit 43f16a629f5b4653fa879ba2635c32262f37331e