
Commit 8600f47

error message opt for XPU, test=kunlun (#27972) (#28078)
* add stack pool2d roi_align xpu op, test=kunlun
* error message opt, test=kunlun
* add xpu unittest, test=kunlun
* skip check grad, test=kunlun
* fix boostget, test=kunlun
* error message opt for XPU, test=kunlun
1 parent d89deae · commit 8600f47

File tree (3 files changed: +62 −29 lines changed)

* paddle/fluid/operators/pool_op_xpu.cc
* paddle/fluid/operators/roi_align_op_xpu.cc
* paddle/fluid/operators/stack_op_xpu.cc

paddle/fluid/operators/pool_op_xpu.cc

Lines changed: 33 additions & 19 deletions
@@ -43,12 +43,14 @@ class PoolXPUKernel : public framework::OpKernel<T> {
     bool exclusive = context.Attr<bool>("exclusive");
     bool is_test = context.Attr<bool>("is_test");
     bool adaptive = context.Attr<bool>("adaptive");
-    PADDLE_ENFORCE_EQ(!adaptive, true,
-                      platform::errors::InvalidArgument(
-                          "XPU does not support adaptive == true!"));
-    PADDLE_ENFORCE_EQ(ksize.size(), 2,
-                      platform::errors::InvalidArgument(
-                          "XPU only support 2 dimension pooling!"));
+    PADDLE_ENFORCE_EQ(
+        !adaptive, true,
+        platform::errors::InvalidArgument(
+            "The Pool2d XPU OP does not support adaptive == true!"));
+    PADDLE_ENFORCE_EQ(
+        ksize.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Pool2d XPU OP only support 2 dimension pooling!"));
     int* index_data = nullptr;
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
@@ -80,7 +82,10 @@ class PoolXPUKernel : public framework::OpKernel<T> {
                                stride_w, out_h, out_w);
     PADDLE_ENFORCE_EQ(
         r, xpu::Error_t::SUCCESS,
-        platform::errors::InvalidArgument("pool2d XPU kernel error!"));
+        platform::errors::External(
+            "The pool2d XPU API return wrong value[%d], please check "
+            "where Baidu Kunlun Card is properly installed.",
+            r));
   }
 };
 template <typename DeviceContext, typename T>
@@ -99,12 +104,15 @@ class PoolGradXPUKernel : public framework::OpKernel<T> {
     bool exclusive = context.Attr<bool>("exclusive");
     bool adaptive = context.Attr<bool>("adaptive");
     const int* index_data = nullptr;
-    PADDLE_ENFORCE_EQ(!adaptive, true,
-                      platform::errors::InvalidArgument(
-                          "XPU does not support adaptive == true!"));
-    PADDLE_ENFORCE_EQ(ksize.size(), 2,
-                      platform::errors::InvalidArgument(
-                          "XPU only support 2 dimension pooling!"));
+    PADDLE_ENFORCE_EQ(
+        !adaptive, true,
+        platform::errors::InvalidArgument(
+            "The Pool2d XPU OP does not support adaptive == true!"));
+    PADDLE_ENFORCE_EQ(ksize.size(), 2, platform::errors::InvalidArgument(
+                                           "The Pool2d XPU OP only support 2 "
+                                           "dimension pooling!, but received "
+                                           "%d-dimension pool kernel size",
+                                           ksize.size()));
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
@@ -139,16 +147,22 @@ class PoolGradXPUKernel : public framework::OpKernel<T> {
     int r =
         xpu::memset(dev_ctx.x_context(), reinterpret_cast<void**>(input_grad),
                     zero, in_x_grad->numel() * sizeof(float));
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::InvalidArgument(
-                          "There are pool2d grad XPU kernel error raised!"));
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "The Pool2d XPU OP return wrong value[%d], please check "
+            "where Baidu Kunlun Card is properly installed.",
+            r));
     r = xpu::pooling_backward(dev_ctx.x_context(), input, output, index_data,
                               output_grad, input_grad, pool_type, c, in_h, in_w,
                               pad_left, pad_right, pad_up, pad_down, win_h,
                               win_w, stride_h, stride_w, out_h, out_w);
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::InvalidArgument(
-                          "There are pool2d grad XPU kernel error raised!"));
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "The Pool2d XPU OP return wrong value[%d], please check "
+            "where Baidu Kunlun Card is properly installed.",
+            r));
   }
 };
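
The recurring pattern in this file checks every XPU runtime call's return code against xpu::Error_t::SUCCESS and reports failures as platform::errors::External, embedding the raw code so a device-side failure is distinguishable from a bad argument. A minimal standalone sketch of the same idea follows; EnforceXpuSuccess, XpuError, and FakeXpuPooling are illustrative stand-ins, not Paddle or XPU runtime APIs, and the real PADDLE_ENFORCE_EQ macro additionally records source location.

#include <cstdio>
#include <stdexcept>

// Illustrative stand-in for xpu::Error_t (not the real XPU runtime enum).
enum XpuError { XPU_SUCCESS_CODE = 0, XPU_RUNTIME_ERROR = 3 };

// Sketch of the "External error" check: the raw return value is kept in
// the message so a failure can be traced back to the device runtime.
void EnforceXpuSuccess(int r, const char* op_name) {
  if (r == XPU_SUCCESS_CODE) return;
  char buf[256];
  std::snprintf(buf, sizeof(buf),
                "The %s XPU API return wrong value[%d], please check "
                "whether the Baidu Kunlun Card is properly installed.",
                op_name, r);
  throw std::runtime_error(buf);
}

// Hypothetical device call standing in for xpu::pooling_forward.
int FakeXpuPooling() { return XPU_RUNTIME_ERROR; }

int main() {
  try {
    EnforceXpuSuccess(FakeXpuPooling(), "pool2d");
  } catch (const std::runtime_error& e) {
    // Prints the formatted message with the raw return code [3].
    std::fprintf(stderr, "%s\n", e.what());
  }
  return 0;
}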

paddle/fluid/operators/roi_align_op_xpu.cc

Lines changed: 13 additions & 5 deletions
@@ -44,11 +44,16 @@ class XPUROIAlignOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(
         rois_batch_size, batch_size,
         platform::errors::InvalidArgument(
-            "The rois_batch_size and imgs batch_size must be the same."));
+            "The rois_batch_size and imgs batch_size of roi_align_xpu OP must "
+            "be the same. But received rois_batch_size %d , batch_size %d",
+            rois_batch_size, batch_size));
     int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      platform::errors::InvalidArgument(
-                          "The rois_num from input and lod must be the same."));
+    PADDLE_ENFORCE_EQ(
+        rois_num, rois_num_with_lod,
+        platform::errors::InvalidArgument(
+            "The rois_num from input and lod of roi_align_xpu OP must be the "
+            "same. But received input rois_num %d , input lod %d",
+            rois_num, rois_num_with_lod));
     T* output_data = out->mutable_data<T>(ctx.GetPlace());
     const T* rois_data = rois->data<T>();
     for (int n = 0; n < rois_batch_size; n++) {
@@ -62,7 +67,10 @@ class XPUROIAlignOpKernel : public framework::OpKernel<T> {
                    rois_lod[n] * channels * pooled_height * pooled_width);
       PADDLE_ENFORCE_EQ(
           r, xpu::Error_t::SUCCESS,
-          platform::errors::InvalidArgument("roi_align XPU kernel error!"));
+          platform::errors::External(
+              "The roi_align XPU OP return wrong value[%d], please check "
+              "where Baidu Kunlun Card is properly installed.",
+              r));
     }
   }
 }
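
The InvalidArgument rewrites in this file apply the same message discipline to shape checks: state the constraint, then echo the operands that violated it. A rough standalone sketch of that convention; CheckEq and its message layout are assumptions for illustration, not Paddle helpers.

#include <cstdio>
#include <sstream>
#include <stdexcept>
#include <string>

// Illustrative equality check that mirrors the commit's message style:
// name the constraint, then report the received values.
template <typename T>
void CheckEq(const T& a, const T& b, const std::string& what_a,
             const std::string& what_b, const std::string& op_name) {
  if (a == b) return;
  std::ostringstream msg;
  msg << "The " << what_a << " and " << what_b << " of " << op_name
      << " OP must be the same. But received " << what_a << " " << a
      << " , " << what_b << " " << b;
  throw std::invalid_argument(msg.str());
}

int main() {
  try {
    // Mismatched batch sizes: the message pinpoints both operands.
    CheckEq(4, 8, "rois_batch_size", "batch_size", "roi_align_xpu");
  } catch (const std::invalid_argument& e) {
    // Prints: "... But received rois_batch_size 4 , batch_size 8"
    std::fprintf(stderr, "%s\n", e.what());
  }
  return 0;
}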

paddle/fluid/operators/stack_op_xpu.cc

Lines changed: 16 additions & 5 deletions
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/operators/stack_op.h"
+#include <string>
 #ifdef PADDLE_WITH_XPU

 namespace paddle {
@@ -45,8 +46,15 @@ class StackXPUKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     void* x_datas_host = std::malloc(n * sizeof(void*));
     void* x_datas_device = nullptr;
-    PADDLE_ENFORCE(xpu_malloc(reinterpret_cast<void**>(&x_datas_device),
-                              n * sizeof(void*)) == XPU_SUCCESS);
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&x_datas_device),
+                                 n * sizeof(void*)),
+                      XPU_SUCCESS,
+                      platform::errors::ResourceExhausted(
+                          "\n\nOut of memory error on XPU, Cannot"
+                          "allocate %s memory on XPU. \n\nPlease "
+                          "check whether there is any other process "
+                          "using XPU.\n",
+                          string::HumanReadableSize(n * sizeof(void*))));
     for (auto i = 0; i < n; ++i) {
       ((const void**)x_datas_host)[i] = x[i]->data<T>();
     }
@@ -55,9 +63,12 @@ class StackXPUKernel : public framework::OpKernel<T> {
                          n * sizeof(void*));
     int r = xpu::stack_forward<float>(dev_ctx.x_context(), pre, post, n,
                                       x_datas_device, y_data);
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::InvalidArgument(
-                          "There are stack XPU kernel error raised!"));
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "The stack XPU API return wrong value[%d], please check "
+            "where Baidu Kunlun Card is properly installed.",
+            r));
     dev_ctx.Wait();
     std::free(x_datas_host);
     xpu_free(x_datas_device);
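
The allocation check here upgrades a bare PADDLE_ENFORCE to PADDLE_ENFORCE_EQ with a ResourceExhausted error, formatting the failed request size via string::HumanReadableSize. A standalone approximation of that formatting, under the usual 1024-based scaling assumption; HumanReadable below is an illustrative stand-in, not Paddle's actual helper.

#include <cstdio>
#include <string>

// Illustrative stand-in for string::HumanReadableSize: scale a byte
// count into the largest unit that keeps the value below 1024.
std::string HumanReadable(double bytes) {
  const char* units[] = {"B", "KB", "MB", "GB", "TB"};
  int i = 0;
  while (bytes >= 1024.0 && i < 4) {
    bytes /= 1024.0;
    ++i;
  }
  char buf[32];
  std::snprintf(buf, sizeof(buf), "%.6lf%s", bytes, units[i]);
  return buf;
}

int main() {
  // Mimics the commit's out-of-memory message for a 1024-pointer
  // request: prints "Cannot allocate 8.000000KB memory on XPU."
  size_t n = 1024;
  std::printf("Cannot allocate %s memory on XPU.\n",
              HumanReadable(n * sizeof(void*)).c_str());
  return 0;
}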
