Commit 2ad5aca

Author: sandyhouse
Commit message: Merge branch 'fix_default_strategy_value' of https://github.com/sandyhouse/Paddle into fix_default_strategy_value
2 parents: c568613 + 293a5fc · commit 2ad5aca

24 files changed (+924 / -286 lines)

cmake/operators.cmake

Lines changed: 1 addition & 1 deletion
@@ -197,7 +197,7 @@ function(op_library TARGET)
       "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
       "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
       "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-      "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+      "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
       "fused_bn_add_activation_op")
   if ("${TARGET}" STREQUAL "${manual_pybind_op}")
     set(pybind_flag 1)

paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc

Lines changed: 17 additions & 8 deletions
@@ -122,8 +122,11 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
   for (auto cur_op : ready_fetch_ops) {
     ready_ops->Push(cur_op);
   }
-  // Atomic variable, no need to lock
-  exec_op_count_ = 0;
+
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    exec_op_count_ = 0;
+  }

   platform::XPUPlace cur_place;
   std::size_t cur_count = 0;
@@ -133,6 +136,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
     auto cur_op = ready_ops->Pop();
     // when execption, get cur_op == nullptr
     if (cur_op == nullptr) {
+      std::lock_guard<std::mutex> lock(mutex_);
       exec_op_count_ = op_deps_.size();
       break;
     }
@@ -151,6 +155,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
     std::unique_lock<std::mutex> lock(mutex_);
     cv_.wait(lock, [&] { return exec_op_count_ >= op_deps_.size(); });
   }
+
   if (exception_.IsCaught()) {
     ExecutionFinal(&fetch_ops);
   }
@@ -255,9 +260,11 @@ void BindThreadedSSAGraphExecutor::RunMultiDeviceOpAsync(
       ready_ops->Push(nullptr);
       exception_.Catch(std::current_exception());
     }
-    // Atomic variable, no need to lock
-    exec_op_count_++;
-    cv_.notify_all();
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      exec_op_count_++;
+      cv_.notify_all();
+    }
   });
 }
 // RunOpAsyncMainStream function is used for computed OPs
@@ -286,9 +293,11 @@ void BindThreadedSSAGraphExecutor::RunOpAsyncMainStream(
       ready_ops->Push(nullptr);
       exception_.Catch(std::current_exception());
     }
-    // Atomic variable, no need to lock
-    exec_op_count_++;
-    cv_.notify_all();
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      exec_op_count_++;
+      cv_.notify_all();
+    }
   });
 }

paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ class BindThreadedSSAGraphExecutor : public SSAGraphExecutor {

   std::mutex mutex_;
   std::condition_variable cv_;
-  std::atomic<unsigned int> exec_op_count_;
+  uint32_t exec_op_count_;
   std::atomic<int> error_state;

   void RunOpAsyncMainStream(
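The .cc and .h changes above are one fix: exec_op_count_ is the predicate of cv_.wait(), and a variable used in a condition_variable predicate must be read and written under the same mutex as the wait, otherwise a notify can fire between the waiter's predicate check and its actual sleep and be lost. Once every access happens under mutex_, the atomic wrapper is redundant, so the member becomes a plain uint32_t. Below is a minimal, self-contained sketch of the pattern (not the Paddle executor itself; `done` and `num_ops` are placeholder names):

// Sketch of the locked-counter + condition_variable pattern the commit adopts.
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

int main() {
  std::mutex mutex;
  std::condition_variable cv;
  uint32_t done = 0;           // plain counter; every access happens under `mutex`
  const uint32_t num_ops = 8;  // stand-in for op_deps_.size()

  std::vector<std::thread> workers;
  for (uint32_t i = 0; i < num_ops; ++i) {
    workers.emplace_back([&] {
      // ... run one op ...
      std::lock_guard<std::mutex> lock(mutex);  // update and notify while holding the lock
      ++done;
      cv.notify_all();
    });
  }

  {
    std::unique_lock<std::mutex> lock(mutex);
    cv.wait(lock, [&] { return done >= num_ops; });  // predicate re-checked under the lock
  }
  for (auto& t : workers) t.join();
  return 0;
}

If the increment happened outside the lock (the old "Atomic variable, no need to lock" path), a worker could bump the counter and notify after the waiter had evaluated the predicate but before it blocked, leaving the main stream waiting forever.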

paddle/fluid/framework/distributed_strategy.proto

Lines changed: 1 addition & 1 deletion
@@ -140,7 +140,7 @@ message DistributedStrategy {
   optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
   optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
   optional bool cudnn_exhaustive_search = 21 [ default = false ];
-  optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
+  optional int32 conv_workspace_size_limit = 22 [ default = 512 ];
   optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = false ];
   optional bool adaptive_localsgd = 24 [ default = false ];
   optional bool fp16_allreduce = 25 [ default = false ];

paddle/fluid/framework/ir/conv_bn_fuse_pass.cc

Lines changed: 12 additions & 1 deletion
@@ -95,13 +95,24 @@ void recompute_bias_and_weights(const Scope* scope,
   variance_array += epsilon;
   variance_array = variance_array.sqrt();
   variance_array = scale_array / variance_array;
-
+  for (int i = 0; i < variance_tensor->numel(); i++) {
+    PADDLE_ENFORCE_EQ(
+        isfinite(variance_array[i]), true,
+        platform::errors::InvalidArgument("fuse batch norm variance should be "
+                                          "finite. Found nonfinite values!"));
+  }
   EigenVectorArrayMap eltwise_y_in_array(
       eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
       eltwise_y_in_tensor->numel(), 1);

   eltwise_y_in_array =
       ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array;
+  for (int i = 0; i < eltwise_y_in_tensor->numel(); i++) {
+    PADDLE_ENFORCE_EQ(
+        isfinite(eltwise_y_in_array[i]), true,
+        platform::errors::InvalidArgument("fused batch norm bias should be "
+                                          "finite. Found nonfinite values!"));
+  }

   // Re-compute weight of conv2d from BN
   auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
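For context on what these checks guard: recompute_bias_and_weights folds the batch_norm statistics into the preceding conv, computing a per-channel scale gamma / sqrt(variance + epsilon) and a new elementwise bias (old_bias − mean) · scale + beta. If the stored variance is zero, negative, or corrupted, that division produces Inf/NaN values that would silently poison the fused weights; the new PADDLE_ENFORCE_EQ loops turn that into a hard error. A hedged sketch of the same arithmetic with plain std::vector instead of Paddle tensors (FoldBatchNorm and its types are illustrative, not part of the pass):

// Per-channel conv+BN folding: scale[c] = gamma[c] / sqrt(var[c] + eps),
// bias[c] = (conv_bias[c] - mean[c]) * scale[c] + beta[c].
#include <cassert>
#include <cmath>
#include <vector>

struct FoldedBN {
  std::vector<float> scale;  // multiply the conv weights of each output channel
  std::vector<float> bias;   // new elementwise bias after folding
};

FoldedBN FoldBatchNorm(const std::vector<float>& gamma,
                       const std::vector<float>& beta,
                       const std::vector<float>& mean,
                       const std::vector<float>& variance,
                       const std::vector<float>& conv_bias, float eps) {
  FoldedBN out;
  out.scale.resize(gamma.size());
  out.bias.resize(gamma.size());
  for (size_t c = 0; c < gamma.size(); ++c) {
    out.scale[c] = gamma[c] / std::sqrt(variance[c] + eps);
    assert(std::isfinite(out.scale[c]) && "BN variance must yield a finite scale");
    out.bias[c] = (conv_bias[c] - mean[c]) * out.scale[c] + beta[c];
    assert(std::isfinite(out.bias[c]) && "folded BN bias must be finite");
  }
  return out;
}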

paddle/fluid/framework/ir/graph_pattern_detector.cc

Lines changed: 10 additions & 7 deletions
@@ -824,22 +824,25 @@ PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input,

   auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr())
                               ->AsOutput()
-                              ->assert_is_op_output("batch_norm", "MeanOut");
+                              ->assert_is_op_output("batch_norm", "MeanOut")
+                              ->assert_has_n_outputs(0);

   auto *bn_variance_out_var =
       pattern->NewNode(bn_variance_out_repr())
           ->AsOutput()
-          ->assert_is_op_output("batch_norm", "VarianceOut");
+          ->assert_is_op_output("batch_norm", "VarianceOut")
+          ->assert_has_n_outputs(0);

-  auto *bn_saved_mean_var =
-      pattern->NewNode(bn_saved_mean_repr())
-          ->AsOutput()
-          ->assert_is_op_output("batch_norm", "SavedMean");
+  auto *bn_saved_mean_var = pattern->NewNode(bn_saved_mean_repr())
+                                ->AsOutput()
+                                ->assert_is_op_output("batch_norm", "SavedMean")
+                                ->assert_has_n_outputs(0);

   auto *bn_saved_variance_var =
       pattern->NewNode(bn_saved_variance_repr())
           ->AsOutput()
-          ->assert_is_op_output("batch_norm", "SavedVariance");
+          ->assert_is_op_output("batch_norm", "SavedVariance")
+          ->assert_has_n_outputs(0);

   conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
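The added assert_has_n_outputs(0) constraints make the ConvBN pattern match only when the batch_norm's MeanOut, VarianceOut, SavedMean and SavedVariance variables have no downstream consumers, so the fuse pass can drop them without breaking another op. A simplified illustration of that rule (a toy Node struct, not the real PDPattern API):

// An output variable of the matched subgraph may be removed during fusion
// only if nothing downstream reads it -- the condition the zero-consumer
// assertion encodes for the BN statistics outputs above.
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> outputs;  // downstream consumers of this variable
};

bool SafeToRemove(const Node& var) { return var.outputs.empty(); }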

paddle/fluid/operators/fused/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -14,11 +14,15 @@ register_operators(EXCLUDES
   fused_embedding_eltwise_layernorm_op
   fusion_group_op
   fusion_gru_op
+  fusion_lstm_op
   fused_bn_add_activation_op)

 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
+op_library(fusion_lstm_op)
 file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n")
+file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_lstm);\n")
+

 if (WITH_GPU)
   # fused_bn_activation_op needs cudnn 7.4.1 above

paddle/fluid/operators/fused/fusion_lstm_op.cc

Lines changed: 16 additions & 2 deletions
@@ -18,6 +18,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif

 namespace paddle {
 namespace operators {
@@ -145,8 +148,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {

 framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context());
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+#ifdef PADDLE_WITH_MKLDNN
+  if (this->CanMKLDNNBeUsed(ctx, data_type)) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
+#endif
+  return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
 }

 void FusionLSTMOpMaker::Make() {
@@ -235,6 +246,9 @@ void FusionLSTMOpMaker::Make() {
                 "`tanh` by default.")
       .SetDefault("tanh")
       .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddComment(R"DOC(
 Fusion Long-Short Term Memory (LSTM) Operator.
 This operator fuse the X into LSTM, more details can refer to LSTM op.
