-/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2024 PaddlePaddle Authors And Intel Corporation.
+// All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <immintrin.h>
 #include <math.h>
 #include <omp.h>
@@ -24,15 +29,15 @@ namespace phi {
 namespace fusion {

 template <typename T, typename Context>
-void RmsNormXftKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const paddle::optional<DenseTensor>& residual,
-                      const DenseTensor& norm_weight,
-                      const float epsilon,
-                      const int begin_norm_axis,
-                      DenseTensor* out,
-                      DenseTensor* residual_out) {
-  const float* x_data = x.data<float>();
+void RmsNormKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const paddle::optional<DenseTensor>& residual,
+                   const DenseTensor& norm_weight,
+                   const float epsilon,
+                   const int begin_norm_axis,
+                   DenseTensor* out,
+                   DenseTensor* residual_out) {
+  const T* x_data = x.data<T>();
   T* out_data = dev_ctx.template Alloc<T>(out);
   const T* norm_weight_data = norm_weight.data<T>();
   // x(batch_size,seq_len,hidden_size)
@@ -61,7 +66,7 @@ void RmsNormXftKernel(const Context& dev_ctx,
     T* pr_out = residual ? residual_out_data + r * ostride : nullptr;
     T* py = out_data + r * ostride;

-    float squareSum = 0;
+    T squareSum = 0;

     __m512 vsqare = _mm512_set1_ps(0);

@@ -92,7 +97,7 @@ void RmsNormXftKernel(const Context& dev_ctx,
     squareSum = _mm512_reduce_add_ps(vsqare);

     // Variance
-    float var = 1 / sqrt(squareSum / size + epsilon);
+    T var = 1 / sqrt(squareSum / size + epsilon);
     __m512 vvar = _mm512_set1_ps(var);

     for (col = 0; col + 15 < size; col += 16) {
@@ -122,4 +127,4 @@ void RmsNormXftKernel(const Context& dev_ctx,
 }  // namespace phi

 PD_REGISTER_KERNEL(
-    rms_norm_xft, CPU, ALL_LAYOUT, phi::fusion::RmsNormXftKernel, float) {}
+    rms_norm_avx, CPU, ALL_LAYOUT, phi::fusion::RmsNormKernel, float, double) {}
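
For reference, the per-row computation that this AVX-512 kernel vectorizes can be sketched in plain scalar C++ as below. This is only a minimal sketch of the RMSNorm-with-optional-residual math visible in the diff; the helper name RmsNormRowReference, its argument list, and the residual-add ordering are illustrative assumptions, not the kernel's actual interface.

// Minimal scalar sketch of the fused residual-add + RMSNorm math that the
// AVX-512 kernel above vectorizes per row. Names, signature, and the exact
// residual handling are illustrative assumptions, not the kernel's API.
#include <cmath>
#include <cstddef>

template <typename T>
void RmsNormRowReference(const T* x,           // one row of length `size`
                         const T* residual,    // optional; may be nullptr
                         const T* weight,      // per-element norm weight
                         float epsilon,
                         std::size_t size,
                         T* out,
                         T* residual_out) {    // may be nullptr if no residual
  // Accumulate the sum of squares of the (optionally residual-added) input.
  T square_sum = 0;
  for (std::size_t i = 0; i < size; ++i) {
    T v = residual ? x[i] + residual[i] : x[i];
    if (residual_out) residual_out[i] = v;  // write back the summed value
    square_sum += v * v;
  }
  // Inverse RMS: 1 / sqrt(mean(v^2) + eps); corresponds to `var` above.
  T inv_rms = static_cast<T>(1) / std::sqrt(square_sum / size + epsilon);
  // Scale each element by the inverse RMS and the learned norm weight.
  for (std::size_t i = 0; i < size; ++i) {
    T v = residual ? x[i] + residual[i] : x[i];
    out[i] = v * inv_rms * weight[i];
  }
}

In the kernel itself these loops appear fused and vectorized with _mm512_* intrinsics (the square-sum reduction is finished with _mm512_reduce_add_ps), and the omp.h include suggests rows are processed in parallel with OpenMP.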