Commit 2ad5aca

Author: sandyhouse
Commit message: Merge branch 'fix_default_strategy_value' of https://github.com/sandyhouse/Paddle into fix_default_strategy_value
2 parents: c568613 + 293a5fc · commit 2ad5aca

24 files changed (+924 / -286 lines)

cmake/operators.cmake

Lines changed: 1 addition & 1 deletion
@@ -197,7 +197,7 @@ function(op_library TARGET)
       "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
       "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
       "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-      "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+      "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
       "fused_bn_add_activation_op")
   if ("${TARGET}" STREQUAL "${manual_pybind_op}")
     set(pybind_flag 1)

paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc

Lines changed: 17 additions & 8 deletions
@@ -122,8 +122,11 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
   for (auto cur_op : ready_fetch_ops) {
     ready_ops->Push(cur_op);
   }
-  // Atomic variable, no need to lock
-  exec_op_count_ = 0;
+
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    exec_op_count_ = 0;
+  }

   platform::XPUPlace cur_place;
   std::size_t cur_count = 0;
@@ -133,6 +136,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
     auto cur_op = ready_ops->Pop();
     // when execption, get cur_op == nullptr
     if (cur_op == nullptr) {
+      std::lock_guard<std::mutex> lock(mutex_);
       exec_op_count_ = op_deps_.size();
       break;
     }
@@ -151,6 +155,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
     std::unique_lock<std::mutex> lock(mutex_);
     cv_.wait(lock, [&] { return exec_op_count_ >= op_deps_.size(); });
   }
+
   if (exception_.IsCaught()) {
     ExecutionFinal(&fetch_ops);
   }
@@ -255,9 +260,11 @@ void BindThreadedSSAGraphExecutor::RunMultiDeviceOpAsync(
       ready_ops->Push(nullptr);
       exception_.Catch(std::current_exception());
     }
-    // Atomic variable, no need to lock
-    exec_op_count_++;
-    cv_.notify_all();
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      exec_op_count_++;
+      cv_.notify_all();
+    }
   });
 }
 // RunOpAsyncMainStream function is used for computed OPs
@@ -286,9 +293,11 @@ void BindThreadedSSAGraphExecutor::RunOpAsyncMainStream(
       ready_ops->Push(nullptr);
       exception_.Catch(std::current_exception());
     }
-    // Atomic variable, no need to lock
-    exec_op_count_++;
-    cv_.notify_all();
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      exec_op_count_++;
+      cv_.notify_all();
+    }
   });
 }

paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ class BindThreadedSSAGraphExecutor : public SSAGraphExecutor {

   std::mutex mutex_;
   std::condition_variable cv_;
-  std::atomic<unsigned int> exec_op_count_;
+  uint32_t exec_op_count_;
   std::atomic<int> error_state;

   void RunOpAsyncMainStream(
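The .cc and .h changes above are one fix: exec_op_count_ is the predicate of cv_.wait(), and a variable used in a condition_variable predicate must be read and written under the same mutex as the wait, otherwise a notify can fire between the waiter's predicate check and its actual sleep and be lost. Once every access happens under mutex_, the atomic wrapper is redundant, so the member becomes a plain uint32_t. Below is a minimal, self-contained sketch of the pattern (not the Paddle executor itself; `done` and `num_ops` are placeholder names):

// Sketch of the locked-counter + condition_variable pattern the commit adopts.
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

int main() {
  std::mutex mutex;
  std::condition_variable cv;
  uint32_t done = 0;           // plain counter; every access happens under `mutex`
  const uint32_t num_ops = 8;  // stand-in for op_deps_.size()

  std::vector<std::thread> workers;
  for (uint32_t i = 0; i < num_ops; ++i) {
    workers.emplace_back([&] {
      // ... run one op ...
      std::lock_guard<std::mutex> lock(mutex);  // update and notify while holding the lock
      ++done;
      cv.notify_all();
    });
  }

  {
    std::unique_lock<std::mutex> lock(mutex);
    cv.wait(lock, [&] { return done >= num_ops; });  // predicate re-checked under the lock
  }
  for (auto& t : workers) t.join();
  return 0;
}

If the increment happened outside the lock (the old "Atomic variable, no need to lock" path), a worker could bump the counter and notify after the waiter had evaluated the predicate but before it blocked, leaving the main stream waiting forever.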

paddle/fluid/framework/distributed_strategy.proto

Lines changed: 1 addition & 1 deletion
@@ -140,7 +140,7 @@ message DistributedStrategy {
   optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
   optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
   optional bool cudnn_exhaustive_search = 21 [ default = false ];
-  optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
+  optional int32 conv_workspace_size_limit = 22 [ default = 512 ];
   optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = false ];
   optional bool adaptive_localsgd = 24 [ default = false ];
   optional bool fp16_allreduce = 25 [ default = false ];

paddle/fluid/framework/ir/conv_bn_fuse_pass.cc

Lines changed: 12 additions & 1 deletion
@@ -95,13 +95,24 @@ void recompute_bias_and_weights(const Scope* scope,
   variance_array += epsilon;
   variance_array = variance_array.sqrt();
   variance_array = scale_array / variance_array;
-
+  for (int i = 0; i < variance_tensor->numel(); i++) {
+    PADDLE_ENFORCE_EQ(
+        isfinite(variance_array[i]), true,
+        platform::errors::InvalidArgument("fuse batch norm variance should be "
+                                          "finite. Found nonfinite values!"));
+  }
   EigenVectorArrayMap eltwise_y_in_array(
       eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
       eltwise_y_in_tensor->numel(), 1);

   eltwise_y_in_array =
       ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array;
+  for (int i = 0; i < eltwise_y_in_tensor->numel(); i++) {
+    PADDLE_ENFORCE_EQ(
+        isfinite(eltwise_y_in_array[i]), true,
+        platform::errors::InvalidArgument("fused batch norm bias should be "
+                                          "finite. Found nonfinite values!"));
+  }

   // Re-compute weight of conv2d from BN
   auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
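For context on what these checks guard: recompute_bias_and_weights folds the batch_norm statistics into the preceding conv, computing a per-channel scale gamma / sqrt(variance + epsilon) and a new elementwise bias (old_bias − mean) · scale + beta. If the stored variance is zero, negative, or corrupted, that division produces Inf/NaN values that would silently poison the fused weights; the new PADDLE_ENFORCE_EQ loops turn that into a hard error. A hedged sketch of the same arithmetic with plain std::vector instead of Paddle tensors (FoldBatchNorm and its types are illustrative, not part of the pass):

// Per-channel conv+BN folding: scale[c] = gamma[c] / sqrt(var[c] + eps),
// bias[c] = (conv_bias[c] - mean[c]) * scale[c] + beta[c].
#include <cassert>
#include <cmath>
#include <vector>

struct FoldedBN {
  std::vector<float> scale;  // multiply the conv weights of each output channel
  std::vector<float> bias;   // new elementwise bias after folding
};

FoldedBN FoldBatchNorm(const std::vector<float>& gamma,
                       const std::vector<float>& beta,
                       const std::vector<float>& mean,
                       const std::vector<float>& variance,
                       const std::vector<float>& conv_bias, float eps) {
  FoldedBN out;
  out.scale.resize(gamma.size());
  out.bias.resize(gamma.size());
  for (size_t c = 0; c < gamma.size(); ++c) {
    out.scale[c] = gamma[c] / std::sqrt(variance[c] + eps);
    assert(std::isfinite(out.scale[c]) && "BN variance must yield a finite scale");
    out.bias[c] = (conv_bias[c] - mean[c]) * out.scale[c] + beta[c];
    assert(std::isfinite(out.bias[c]) && "folded BN bias must be finite");
  }
  return out;
}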

paddle/fluid/framework/ir/graph_pattern_detector.cc

Lines changed: 10 additions & 7 deletions
@@ -824,22 +824,25 @@ PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input,

   auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr())
                               ->AsOutput()
-                              ->assert_is_op_output("batch_norm", "MeanOut");
+                              ->assert_is_op_output("batch_norm", "MeanOut")
+                              ->assert_has_n_outputs(0);

   auto *bn_variance_out_var =
       pattern->NewNode(bn_variance_out_repr())
           ->AsOutput()
-          ->assert_is_op_output("batch_norm", "VarianceOut");
+          ->assert_is_op_output("batch_norm", "VarianceOut")
+          ->assert_has_n_outputs(0);

-  auto *bn_saved_mean_var =
-      pattern->NewNode(bn_saved_mean_repr())
-          ->AsOutput()
-          ->assert_is_op_output("batch_norm", "SavedMean");
+  auto *bn_saved_mean_var = pattern->NewNode(bn_saved_mean_repr())
+                                ->AsOutput()
+                                ->assert_is_op_output("batch_norm", "SavedMean")
+                                ->assert_has_n_outputs(0);

   auto *bn_saved_variance_var =
       pattern->NewNode(bn_saved_variance_repr())
           ->AsOutput()
-          ->assert_is_op_output("batch_norm", "SavedVariance");
+          ->assert_is_op_output("batch_norm", "SavedVariance")
+          ->assert_has_n_outputs(0);

   conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
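The added assert_has_n_outputs(0) constraints make the ConvBN pattern match only when the batch_norm's MeanOut, VarianceOut, SavedMean and SavedVariance variables have no downstream consumers, so the fuse pass can drop them without breaking another op. A simplified illustration of that rule (a toy Node struct, not the real PDPattern API):

// An output variable of the matched subgraph may be removed during fusion
// only if nothing downstream reads it -- the condition the zero-consumer
// assertion encodes for the BN statistics outputs above.
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> outputs;  // downstream consumers of this variable
};

bool SafeToRemove(const Node& var) { return var.outputs.empty(); }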

paddle/fluid/operators/fused/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -14,11 +14,15 @@ register_operators(EXCLUDES
   fused_embedding_eltwise_layernorm_op
   fusion_group_op
   fusion_gru_op
+  fusion_lstm_op
   fused_bn_add_activation_op)

 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
+op_library(fusion_lstm_op)
 file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n")
+file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_lstm);\n")
+

 if (WITH_GPU)
   # fused_bn_activation_op needs cudnn 7.4.1 above

paddle/fluid/operators/fused/fusion_lstm_op.cc

Lines changed: 16 additions & 2 deletions
@@ -18,6 +18,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif

 namespace paddle {
 namespace operators {
@@ -145,8 +148,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {

 framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context());
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+#ifdef PADDLE_WITH_MKLDNN
+  if (this->CanMKLDNNBeUsed(ctx, data_type)) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
+#endif
+  return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
 }

 void FusionLSTMOpMaker::Make() {
@@ -235,6 +246,9 @@ void FusionLSTMOpMaker::Make() {
                 "`tanh` by default.")
       .SetDefault("tanh")
       .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddComment(R"DOC(
 Fusion Long-Short Term Memory (LSTM) Operator.
 This operator fuse the X into LSTM, more details can refer to LSTM op.
