
Commit 3b6e31b

Add CPU and NPU operator of c_embedding
1 parent b8e4ab4 commit 3b6e31b

10 files changed: +215 additions, -102 deletions

paddle/fluid/operators/collective/c_allreduce_op.h

Lines changed: 2 additions & 3 deletions

@@ -144,8 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
   try {
     const auto& runner_mean = paddle::operators::NpuOpRunner(
         "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
-    // FIXME(gongwb): not need to open this.
-    // runner_mean.Run(stream);
+    runner_mean.Run(stream);
     TensorToVector(mean, dev_ctx, &vec);
   } catch (...) {
     LOG(WARNING) << "ContainsNan catch exception";
@@ -241,7 +240,7 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
       case framework::proto::VarType::FP32: {
         if (FLAGS_hccl_check_nan) {
           VLOG(3) << "prepare to FoundNanInf";
-          ContainsNan(*dev_ctx, dev_ctx->stream(), in);
+          found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
           VLOG(3) << "check_numerics:" << found_nan;
         }
         break;
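
Note: the hunk above actually runs the ReduceMeanD kernel and feeds its result into found_nan. The trick behind this check is that NaN propagates through a mean reduction, so only a single scalar has to be copied off the device to test a whole tensor. A minimal numpy sketch of the same idea (illustration only, not the NPU code path):

import numpy as np

def contains_nan(tensor):
    # NaN propagates through arithmetic, so the mean of the whole
    # tensor is NaN iff some element is NaN (barring inf overflow).
    # Reducing first keeps the device-to-host copy to one scalar.
    return bool(np.isnan(tensor.mean()))

x = np.ones((4, 8), dtype=np.float32)
assert not contains_nan(x)
x[2, 3] = np.nan
assert contains_nan(x)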

paddle/fluid/operators/collective/c_embedding_op.cc

Lines changed: 6 additions & 5 deletions

@@ -74,7 +74,7 @@ class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor) The input represents embedding tensors, "
              "which is a learnable parameter.");
     AddInput("Ids",
-             "An input with type int32 or int64"
+             "An input with type int32 or int64 in CPU and GPU, int32 in NPU "
              "contains the ids to be looked up in W.");
     AddOutput("Out", "The lookup results, which have the same type as W.");

@@ -126,16 +126,17 @@ class CEmbeddingOpGrad : public framework::OperatorWithKernel {
     // check valid
     PADDLE_ENFORCE_EQ(table_dims.size(), 2,
                       platform::errors::InvalidArgument(
-                          "npu only accept the dims of table_t == 2"));
+                          "Only accept the dims of table_t == 2"));

     const int64_t start_idx = ctx->Attrs().Get<int64_t>("start_index");
     const int64_t height = table_dims[0];
     const int64_t width = table_dims[1];

     PADDLE_ENFORCE_EQ(
-        (height >= 0 && width >= 0 && start_idx >= 0), true,
-        "height:%ld width:%ld start_idx:%ld must not have negtive values",
-        height, width, start_idx);
+        (height > 0 && width > 0 && start_idx >= 0), true,
+        platform::errors::InvalidArgument(
+            "height:%ld width:%ld start_idx:%ld must not have negtive values",
+            height, width, start_idx));
   }

 protected:

paddle/fluid/operators/collective/c_embedding_op.cu

Lines changed: 1 addition & 1 deletion

@@ -107,7 +107,7 @@ class CEmbeddingCUDAKernel : public framework::OpKernel<T> {
           limit);
     } else {
       PADDLE_THROW(platform::errors::Unavailable(
-          "c_embedding ids only support int32 or int64."));
+          "GPU c_embedding ids only support int32 or int64."));
     }
   }
 };

paddle/fluid/operators/collective/c_embedding_op.h

Lines changed: 16 additions & 10 deletions

@@ -38,12 +38,13 @@ void GetIdsEmbedding(const TIds* ids, size_t ids_len, int64_t start_idx,
     int64_t local = id - start_idx;

     if (local >= 0 && local < height) {
-      /*
-      for (int64_t w = 0; w < width; w++) {
-        out[i * width + w] = table[local * width + w];
-      }
-      */
+      // for (int64_t w = 0; w < width; w++) {
+      //   out[i * width + w] = table[local * width + w];
+      // }
+
       memcpy(out + i * width, table + local * width, width * sizeof(TData));
+    } else {
+      memset(out + i * width, 0, width * sizeof(TData));
     }
   }
 }
@@ -74,7 +75,7 @@ class CEmbeddingOpCPUKernel : public framework::OpKernel<T> {
                                   table_data, height, width, output_data);
     } else {
       PADDLE_THROW(platform::errors::Unavailable(
-          "c_embedding ids only support int32 or int64."));
+          "CPU c_embedding ids only support int32 or int64."));
     }
   }
 };
@@ -108,12 +109,17 @@ class CEmbeddingGradOpCPUKernel : public framework::OpKernel<T> {
     T* table_grad_data =
         table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());

+    size_t table_t_mem_size =
+        table_t->numel() * framework::SizeOfType(table_grad_t->type());
+    size_t table_grad_t_mem_size =
+        table_grad_t->numel() * framework::SizeOfType(table_grad_t->type());
+
     VLOG(10) << "table_dims:" << table_t->dims()
-             << ", table_t memory_size:" << table_t->memory_size()
-             << ", table_grad_t memory_size:" << table_grad_t->memory_size()
+             << ", table_t memory_size:" << table_t_mem_size
+             << ", table_grad_t memory_size:" << table_grad_t_mem_size
              << ", start_index:" << start_idx;

-    memset(table_grad_data, 0, table_grad_t->memory_size());
+    memset(table_grad_data, 0, table_grad_t_mem_size);
     const T* d_output_data = d_output_t->data<T>();

     const int64_t height = table_t->dims()[0];
@@ -128,7 +134,7 @@ class CEmbeddingGradOpCPUKernel : public framework::OpKernel<T> {
                                     table_grad_data, height, width, d_output_data);
     } else {
       PADDLE_THROW(platform::errors::Unavailable(
-          "c_embedding ids only support int32 or int64."));
+          "CPU c_embedding ids only support int32 or int64."));
     }
   }
 };
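
Note: the forward kernel now zero-fills output rows whose ids fall outside this shard's [start_idx, start_idx + height) range instead of leaving them uninitialized, and the grad kernel sizes its memset from numel() * SizeOfType(...) rather than memory_size(). A rough numpy sketch of the CPU backward logic, for orientation (c_embedding_grad is a hypothetical helper, not the Paddle API):

import numpy as np

def c_embedding_grad(start, height, width, ids, d_out):
    # Zero-init the local shard's gradient (the memset above), then
    # scatter-add each output-gradient row whose global id falls in
    # [start, start + height); out-of-shard ids contribute nothing.
    table_grad = np.zeros((height, width), dtype=d_out.dtype)
    for i, idx in enumerate(ids.flatten()):
        local = idx - start
        if 0 <= local < height:
            table_grad[local] += d_out.reshape(-1, width)[i]
    return table_grad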

paddle/fluid/operators/collective/c_embedding_op_npu.cc

Lines changed: 16 additions & 14 deletions

@@ -113,10 +113,12 @@ void NPUGetIdsEmbedding(const framework::ExecutionContext &context) {
       framework::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
   framework::LoDTensor table_t_pad;

-  size_t mem_size = table_t->memory_size();
+  size_t mem_size = table_t->numel() * framework::SizeOfType(table_t->type());
   size_t line_mem_size =
       table_t->dims()[1] * framework::SizeOfType(table_t->type());
-  PADDLE_ENFORCE_EQ(line_mem_size % 64, 0, "must align by 64");
+  PADDLE_ENFORCE_EQ(line_mem_size % 64, 0,
+                    platform::errors::InvalidArgument(
+                        "NPU only accept the second dim must align by 64"));

   VLOG(10) << "mem_size:" << mem_size << ",line_mem_size:" << line_mem_size
            << ", pad_shape:" << pad_shape << ", table_dims:" << table_t->dims();
@@ -148,11 +150,9 @@ class CEmbeddingNPUKernel : public framework::OpKernel<T> {
     const auto &index_type = ids_t->type();
     if (index_type == framework::proto::VarType::INT32) {
       NPUGetIdsEmbedding<int32_t, T>(context);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      NPUGetIdsEmbedding<int64_t, T>(context);
     } else {
       PADDLE_THROW(platform::errors::Unavailable(
-          "c_embedding ids only support int32 or int64."));
+          "NPU c_embedding ids only support int32."));
     }
   }
 };
@@ -186,9 +186,10 @@ void NPUUpdateEmbedding(const framework::ExecutionContext &context) {
   // set table_t_pad to zero
   uint8_t *pad_data = reinterpret_cast<uint8_t *>(
       table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
-  PADDLE_ENFORCE_NPU_SUCCESS(
-      aclrtMemsetAsync(pad_data, table_t_pad.memory_size(), 0,
-                       table_t_pad.memory_size(), stream));
+  size_t table_t_pad_mem_size =
+      table_t_pad.numel() * framework::SizeOfType(table_t_pad.type());
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemsetAsync(pad_data, table_t_pad_mem_size, 0,
+                                              table_t_pad_mem_size, stream));

   // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
   // can be different tensor, but in cann 20.2+, it does inplace operation.
@@ -200,12 +201,15 @@ void NPUUpdateEmbedding(const framework::ExecutionContext &context) {

   // copy table_t_pad to table_t
   T *dst = table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());
-  const size_t mem_size = table_grad_t->memory_size();
+  const size_t mem_size =
+      table_grad_t->numel() * framework::SizeOfType(table_grad_t->type());

   // check align
   size_t line_mem_size =
       table_grad_t->dims()[1] * framework::SizeOfType(table_grad_t->type());
-  PADDLE_ENFORCE_EQ(line_mem_size % 64, 0, "must align by 64");
+  PADDLE_ENFORCE_EQ(line_mem_size % 64, 0,
+                    platform::errors::InvalidArgument(
+                        "NPU only accept the second dim must align by 64"));

   PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpyAsync(
       dst, mem_size, pad_data, mem_size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream));
@@ -220,11 +224,9 @@ class CEmbeddingGradNPUKernel : public framework::OpKernel<T> {
     const auto &index_type = ids_t->type();
     if (index_type == framework::proto::VarType::INT32) {
       NPUUpdateEmbedding<int32_t, T>(context);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      NPUUpdateEmbedding<int64_t, T>(context);
     } else {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "c_embedding ids only support int32 or int64."));
+      PADDLE_THROW(
+          platform::errors::Unavailable("c_embedding ids only support int32."));
     }
   }
 };
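
Note: both enforce checks in this file guard the same invariant: one embedding row (second dim times element size) must span a multiple of 64 bytes before the NPU pad/copy path is taken. A small illustrative check (check_npu_row_alignment is a hypothetical helper, not part of the patch):

import numpy as np

def check_npu_row_alignment(width, dtype):
    # Mirrors PADDLE_ENFORCE_EQ(line_mem_size % 64, 0, ...): the
    # byte length of one table row must be 64-byte aligned.
    line_mem_size = width * np.dtype(dtype).itemsize
    if line_mem_size % 64 != 0:
        raise ValueError("row is %d bytes, not 64-byte aligned" %
                         line_mem_size)
    return line_mem_size

check_npu_row_alignment(64, "float32")    # 256 bytes -> passes
# check_npu_row_alignment(10, "float32")  # 40 bytes -> would raise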

paddle/fluid/platform/flags.cc

Lines changed: 1 addition & 1 deletion

@@ -93,7 +93,7 @@ DEFINE_string(selected_npus, "",
               "This option is useful when doing multi process training and "
               "each process have only one device (NPU). If you want to use "
               "all visible devices, set this to empty string.");
-DEFINE_bool(hccl_check_nan, true,
+DEFINE_bool(hccl_check_nan, false,
             "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
             "core when meets Nan value");
 DEFINE_string(
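
Note: with the default flipped to false, the NaN check before hccl_allreduce_sum becomes opt-in. One way to re-enable it, assuming FLAGS_hccl_check_nan is among the gflags Paddle reads from the environment at startup (an assumption, not verified from this diff):

import os

# Assumption: Paddle picks up FLAGS_* overrides from the environment
# during framework initialization, so set it before importing paddle.
os.environ["FLAGS_hccl_check_nan"] = "true"

import paddle  # import after the env override is deliberate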

python/paddle/fluid/tests/unittests/CMakeLists.txt

Lines changed: 1 addition & 4 deletions

@@ -91,6 +91,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_c_split)
     LIST(REMOVE_ITEM TEST_OPS test_allgather)
     LIST(REMOVE_ITEM TEST_OPS test_c_identity)
+    LIST(REMOVE_ITEM TEST_OPS test_c_embedding_op)
     LIST(REMOVE_ITEM TEST_OPS test_allreduce)
     LIST(REMOVE_ITEM TEST_OPS test_broadcast)
     LIST(REMOVE_ITEM TEST_OPS test_collective_reduce)
@@ -119,10 +120,6 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler)
 endif()

-if(((NOT WITH_ROCM) AND (NOT WITH_GPU) AND (NOT WITH_ASCEND_CL)) OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_c_embedding_op)
-endif()
-
 if(WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception)
     LIST(REMOVE_ITEM TEST_OPS test_trainer_desc)

python/paddle/fluid/tests/unittests/c_embedding_op_base.py

Lines changed: 132 additions & 0 deletions

@@ -0,0 +1,132 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.framework import core
+
+SEED = 2021
+np.random.seed(SEED)
+
+
+def get_c_embedding(start, end, table, ids):
+    index = ids.flatten()
+    input_mask = (index < start) | (index >= end)
+    masked_input = index - start
+    masked_input[input_mask] = 0
+    output = table[masked_input]
+    output[input_mask] = 0.0
+    return output
+
+
+class TestCEmbeddingCPU(OpTest):
+    def setUp(self):
+        self.init_dtype()
+        self.initcase()
+        if core.is_compiled_with_npu():
+            self.__class__.use_npu = True
+        elif core.is_compiled_with_cuda():
+            self.__class__.exist_fp64_check_grad = True
+
+    def initcase(self):
+        self.op_type = "c_embedding"
+        table = np.random.random((17, 64)).astype(self.dtype)
+        ids = np.random.randint(
+            low=0, high=17 * 2, size=(2, 4)).astype(self.ids_dtype)
+        self.start_index = 10
+        self.end_index = self.start_index + 17
+
+        self.inputs = {'W': table, 'Ids': ids}
+        np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
+        self.outputs = {'Out': np_out.reshape((2, 4, 64))}
+        self.attrs = {'start_index': self.start_index}
+        if core.is_compiled_with_npu():
+            self.__class__.use_npu = True
+
+    def test_check_cpu(self):
+        self.check_output_with_place(core.CPUPlace())
+
+    def test_check_cpu_grad(self):
+        self.check_grad_with_place(core.CPUPlace(), ['W'], 'Out')
+
+    def init_dtype(self):
+        self.dtype = "float32"
+        self.ids_dtype = "int64"
+
+
+class TestCEmbeddingOpBase(TestCEmbeddingCPU):
+    def setUp(self):
+        self.init_dtype()
+        self.initcase()
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            self.check_output_with_place(core.CUDAPlace(0))
+        elif core.is_compiled_with_npu():
+            self.check_output_with_place(core.NPUPlace(0))
+
+    def test_check_grad(self):
+        if core.is_compiled_with_cuda():
+            self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
+        elif core.is_compiled_with_npu():
+            self.check_grad_with_place(core.NPUPlace(0), ['W'], 'Out')
+
+    def init_dtype(self):
+        if core.is_compiled_with_cuda():
+            self.dtype = "float64"
+            self.ids_dtype = "int64"
+        elif core.is_compiled_with_npu():
+            self.dtype = "float32"
+            self.ids_dtype = "int32"
+
+
+class TestCEmbeddingOpFP32(TestCEmbeddingOpBase):
+    def setUp(self):
+        self.init_dtype()
+        self.initcase()
+
+    def initcase(self):
+        self.op_type = "c_embedding"
+        table = np.random.random((17, 64)).astype(self.dtype)
+        ids = np.random.randint(
+            low=0, high=17 * 2, size=(2, 4)).astype(self.ids_dtype)
+        self.start_index = 10
+        ids[0][1] = 12
+        ids[0][2] = 12
+        ids[1][2] = 12
+        ids[1][3] = 12
+        self.end_index = self.start_index + 17
+
+        self.inputs = {'W': table, 'Ids': ids}
+        np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
+        self.outputs = {'Out': np_out.reshape((2, 4, 64))}
+        self.attrs = {'start_index': self.start_index}
+
+        if core.is_compiled_with_npu():
+            self.__class__.use_npu = True
+        elif core.is_compiled_with_cuda():
+            self.__class__.exist_fp64_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = "float32"
+        self.ids_dtype = "int32"
+
+
+if __name__ == "__main__":
+    unittest.main()
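
Note: a quick worked check of get_c_embedding's masking, runnable on its own (tiny shapes chosen for illustration; the helper is a self-contained copy of the one in the test file):

import numpy as np

def get_c_embedding(start, end, table, ids):
    index = ids.flatten()
    input_mask = (index < start) | (index >= end)
    masked_input = index - start
    masked_input[input_mask] = 0
    output = table[masked_input]
    output[input_mask] = 0.0
    return output

table = np.arange(17 * 4, dtype="float32").reshape(17, 4)  # local shard
out = get_c_embedding(10, 27, table, np.array([9, 10, 26, 27]))
assert (out[0] == 0).all() and (out[3] == 0).all()  # ids 9, 27 out of range
assert (out[1] == table[0]).all() and (out[2] == table[16]).all()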

python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.c_embedding_op_base import TestCEmbeddingCPU, TestCEmbeddingOpBase, TestCEmbeddingOpFP32
+
+paddle.enable_static()
+
+TestCEmbeddingCPU()
+
+TestCEmbeddingOpBase()
+
+TestCEmbeddingOpFP32()
+
+if __name__ == "__main__":
+    unittest.main()
