Support dropout backward in eval mode (#35122)

smallv0221 · web-flow · commit f1275fb60599 · 2021-08-26T19:28:01.000+08:00
* Support dropout backward in eval mode

* Add downscale case (eval-mode gradient scaled by 1 - dropout_prob)

* minor fix

* minor fix
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
@@ -117,10 +117,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
-                      platform::errors::InvalidArgument(
-                          "GradOp is only callable when is_test is false"));
-
     OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "DropoutGrad");
     OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
                    framework::GradVarName("Out"), "DropoutGrad");
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
@@ -160,50 +160,54 @@ template <typename DeviceContext, typename T>
 class DropoutGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(!context.Attr<bool>("is_test"), true,
-                      platform::errors::PreconditionNotMet(
-                          "GradOp is only callable when is_test is false"));
-
     auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
     auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
     auto* mask = context.Input<Tensor>("Mask");
     grad_x->mutable_data<T>(context.GetPlace());
     auto size = grad_x->numel();
 
-    auto M = EigenVector<uint8_t>::Flatten(*mask);
     auto dX = EigenVector<T>::Flatten(*grad_x);
     auto dY = EigenVector<T>::Flatten(*grad_y);
 
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
     auto& dropout_implementation =
         context.Attr<std::string>("dropout_implementation");
-    if (dropout_implementation == "upscale_in_train") {
-      float dropout_prob = context.Attr<float>("dropout_prob");
-      if (dropout_prob == 1.0f) {
-        dX.device(place) = static_cast<T>(0) * dY;
+    if (context.Attr<bool>("is_test") == true) {
+      if (dropout_implementation == "upscale_in_train") {
+        dX.device(place) = static_cast<T>(1) * dY;
       } else {
-        int vec_size = VectorizedSize<T>(grad_y->data<T>());
-        if (platform::is_gpu_place(context.GetPlace()) && vec_size == 4 &&
-            size % 4 == 0) {
+        float dropout_prob = context.Attr<float>("dropout_prob");
+        dX.device(place) = dY * static_cast<T>(1.0f - dropout_prob);
+      }
+    } else {
+      auto M = EigenVector<uint8_t>::Flatten(*mask);
+      if (dropout_implementation == "upscale_in_train") {
+        float dropout_prob = context.Attr<float>("dropout_prob");
+        if (dropout_prob == 1.0f) {
+          dX.device(place) = static_cast<T>(0) * dY;
+        } else {
+          int vec_size = VectorizedSize<T>(grad_y->data<T>());
+          if (platform::is_gpu_place(context.GetPlace()) && vec_size == 4 &&
+              size % 4 == 0) {
 #if defined(__NVCC__) || defined(__HIPCC__)
-          auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
-          auto stream = context.cuda_device_context().stream();
-          platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(
-              context.cuda_device_context(), size);
-          DropoutGradCUDAKernel<
-              T, uint8_t,
-              4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-              grad_y->data<T>(), mask->data<uint8_t>(), factor, size,
-              grad_x->data<T>());
+            auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
+            auto stream = context.cuda_device_context().stream();
+            platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(
+                context.cuda_device_context(), size);
+            DropoutGradCUDAKernel<T, uint8_t, 4><<<
+                config.block_per_grid, config.thread_per_block, 0, stream>>>(
+                grad_y->data<T>(), mask->data<uint8_t>(), factor, size,
+                grad_x->data<T>());
 #endif
-        } else {
-          dX.device(place) =
-              dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
+          } else {
+            dX.device(place) =
+                dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
+          }
         }
+      } else {
+        dX.device(place) = dY * M.cast<T>();
       }
-    } else {
-      dX.device(place) = dY * M.cast<T>();
     }
   }
 };