PaddlePaddle
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
Lines changed: 0 additions & 10 deletions
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
Lines changed: 0 additions & 36 deletions
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
Lines changed: 0 additions & 8 deletions
diff --git a/paddle/phi/kernels/fusion/cpu/rms_norm_avx_kernel.cc b/paddle/phi/kernels/fusion/cpu/rms_norm_avx_kernel.cc
Lines changed: 51 additions & 16 deletions
@@ -2348,16 +2348,6 @@
   intermediate : inv_var
   backward : rms_norm_grad
 
-- op : rms_norm_avx
-  args : (Tensor x, Tensor residual, Tensor norm_weight, float epsilon, int begin_norm_axis)
-  output : Tensor(out),Tensor(residual_out)
-  infer_meta :
-    func : RmsNormAvxInferMeta
-  kernel :
-    func : rms_norm_avx
-    data_type : x
-  optional : residual,residual_out
-
 - op : rmsprop_
   args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, Tensor master_param, float epsilon = 1.0e-10f, float decay = 0.9f, float momentum = 0.0f, bool centered = false, bool multi_precision = false)
   output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out), Tensor(master_param_outs)
 
@@ -3523,42 +3523,6 @@ void QuantizeLinearInferMeta(const MetaTensor& x,
   }
 }
 
-void RmsNormAvxInferMeta(const MetaTensor& x,
-                         const MetaTensor& residual,
-                         const MetaTensor& norm_weight,
-                         const float epsilon,
-                         const int begin_norm_axis,
-                         MetaTensor* out,
-                         MetaTensor* residual_out) {
-  std::vector<int64_t> x_dims_vec = common::vectorize(x.dims());
-  auto x_dims_size = x_dims_vec.size();
-
-  size_t normalized_dims = 1;
-  for (size_t i = begin_norm_axis; i < x_dims_size; ++i) {
-    normalized_dims *= x_dims_vec[i];
-  }
-  PADDLE_ENFORCE_EQ(normalized_dims,
-                    norm_weight.dims()[0],
-                    phi::errors::InvalidArgument(
-                        "The normalized size of Input(X) must equal to be"
-                        "the size of Weight, but received"
-                        "normalized size of Input(X) is [%d], received size"
-                        "of Weight is [%d]",
-                        normalized_dims,
-                        norm_weight.dims()[0]));
-
-  auto out_dims = common::make_ddim(x_dims_vec);
-  out->set_dims(out_dims);
-  out->set_dtype(x.dtype());
-  out->set_layout(x.layout());
-  out->share_lod(x);
-
-  residual_out->set_dims(out_dims);
-  residual_out->set_dtype(x.dtype());
-  residual_out->set_layout(x.layout());
-  residual_out->share_lod(x);
-}
-
 void RmsNormInferMeta(const MetaTensor& x,
                       const MetaTensor& bias,
                       const MetaTensor& residual,
 
@@ -951,14 +951,6 @@ void MaskedMultiheadAttentionInferMeta(const MetaTensor& x,
                                        MetaTensor* cache_kv_out,
                                        MetaTensor* beam_cache_offset_out);
 
-void RmsNormAvxInferMeta(const MetaTensor& x,
-                         const MetaTensor& residual,
-                         const MetaTensor& norm_weight,
-                         const float epsilon,
-                         const int begin_norm_axis,
-                         MetaTensor* out,
-                         MetaTensor* residual_out);
-
 void FullWithTensorInferMeta(const MetaTensor& shape,
                              DataType dtype,
                              MetaTensor* out);
 
@@ -1,5 +1,4 @@
-// Copyright (c) 2024 PaddlePaddle Authors And Intel Corporation.
-// All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -29,18 +28,26 @@ namespace phi {
 namespace fusion {
 
 template <typename T, typename Context>
-void RmsNormKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const paddle::optional<DenseTensor>& residual,
-                   const DenseTensor& norm_weight,
-                   const float epsilon,
-                   const int begin_norm_axis,
-                   DenseTensor* out,
-                   DenseTensor* residual_out) {
+void RmsNormAvxKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const paddle::optional<DenseTensor>& bias,
+                      const paddle::optional<DenseTensor>& residual,
+                      const DenseTensor& norm_weight,
+                      const paddle::optional<DenseTensor>& norm_bias,
+                      const float epsilon,
+                      const int begin_norm_axis,
+                      const float quant_scale,
+                      const int quant_round_type,
+                      const float quant_max_bound,
+                      const float quant_min_bound,
+                      DenseTensor* out,
+                      DenseTensor* residual_out,
+                      DenseTensor* inv_var) {
+  if (quant_scale > 0.0f) {
+    PD_THROW("NOT supported quant int8. ");
+  }
+
   const T* x_data = x.data<T>();
-  T* out_data = dev_ctx.template Alloc<T>(out);
-  const T* norm_weight_data = norm_weight.data<T>();
-  // x(batch_size,seq_len,hidden_size)
   int32_t rows = 1;
   int32_t cols = 1;
   for (int i = 0; i < begin_norm_axis; i++) {
@@ -53,10 +60,16 @@ void RmsNormKernel(const Context& dev_ctx,
   int size = cols;
   auto istride = cols;
   auto ostride = cols;
+  const T* norm_weight_data = norm_weight.data<T>();
+  const T* norm_bias_data = norm_bias ? norm_bias.get().data<T>() : nullptr;
   const T* residual_data = residual ? residual.get().data<T>() : nullptr;
+  const T* bias_data = bias ? bias.get().data<T>() : nullptr;
+  T* out_data = dev_ctx.template Alloc<T>(out);
   T* residual_out_data =
       residual ? dev_ctx.template Alloc<T>(residual_out) : nullptr;
 
+  __m512 vb = _mm512_setzero_ps();
+  const T* pb = bias_data;
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
@@ -77,6 +90,10 @@ void RmsNormKernel(const Context& dev_ctx,
       if (residual) {
         __m512 residual_vx = _mm512_loadu_ps(pr + col);
         vx = _mm512_add_ps(vx, residual_vx);
+        if (bias) {
+          __m512 vb = _mm512_loadu_ps(pb + col);
+          vx = _mm512_add_ps(vx, vb);
+        }
         _mm512_storeu_ps(pr_out + col, vx);
       }
       __m512 tmp = _mm512_mul_ps(vx, vx);
@@ -88,6 +105,10 @@ void RmsNormKernel(const Context& dev_ctx,
       if (residual) {
         __m512 residual_vx = _mm512_maskz_loadu_ps(mask, pr + col);
         vx = _mm512_mask_add_ps(vx, mask, vx, residual_vx);
+        if (bias) {
+          __m512 vb = _mm512_maskz_loadu_ps(mask, pb + col);
+          vx = _mm512_mask_add_ps(vx, mask, vx, vb);
+        }
         _mm512_mask_storeu_ps(pr_out + col, mask, vx);
       }
       __m512 tmp = _mm512_mul_ps(vx, vx);
@@ -105,9 +126,16 @@ void RmsNormKernel(const Context& dev_ctx,
       if (residual) {
         __m512 residual_vx = _mm512_loadu_ps(pr + col);
         vx = _mm512_add_ps(vx, residual_vx);
+        if (bias) {
+          __m512 vb = _mm512_loadu_ps(pb + col);
+          vx = _mm512_add_ps(vx, vb);
+        }
       }
       __m512 vw = _mm512_loadu_ps(norm_weight_data + col);
-      __m512 vy = vx * vvar * vw;
+      if (norm_bias_data) {
+        vb = _mm512_loadu_ps(norm_bias_data + col);
+      }
+      __m512 vy = vx * vvar * vw + vb;
       _mm512_storeu_ps(py + col, vy);
     }
     if (col < size) {
@@ -116,9 +144,16 @@ void RmsNormKernel(const Context& dev_ctx,
       if (residual) {
         __m512 residual_vx = _mm512_maskz_loadu_ps(mask, pr + col);
         vx = _mm512_mask_add_ps(vx, mask, vx, residual_vx);
+        if (bias) {
+          __m512 vb = _mm512_maskz_loadu_ps(mask, pb + col);
+          vx = _mm512_mask_add_ps(vx, mask, vx, vb);
+        }
       }
       __m512 vw = _mm512_maskz_loadu_ps(mask, norm_weight_data + col);
-      __m512 vy = vx * vvar * vw;
+      if (norm_bias_data) {
+        vb = _mm512_maskz_loadu_ps(mask, norm_bias_data + col);
+      }
+      __m512 vy = vx * vvar * vw + vb;
       _mm512_mask_storeu_ps(py + col, mask, vy);
     }
   }  // end for rows
@@ -127,4 +162,4 @@ void RmsNormKernel(const Context& dev_ctx,
 }  // namespace phi
 
 PD_REGISTER_KERNEL(
-    rms_norm_avx, CPU, ALL_LAYOUT, phi::fusion::RmsNormKernel, float, double) {}
+    rms_norm, CPU, ALL_LAYOUT, phi::fusion::RmsNormAvxKernel, float, double) {}