Skip to content

Commit 311a5f0

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev/fuse_all_opt
2 parents 11b2793 + ab0272e commit 311a5f0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+1760
-394
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ option(WITH_STRIP "Strip so files of Whl packages" OFF)
216216

217217
# PY_VERSION
218218
if(NOT PY_VERSION)
219-
set(PY_VERSION 3.7)
219+
set(PY_VERSION 3.6)
220220
endif()
221221
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
222222

README.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<p align="center">
1+
<p align="center">
22
<img align="center" src="doc/imgs/logo.png", width=1600>
33
<p>
44

@@ -50,10 +50,9 @@ Now our developers can acquire Tesla V100 online computing resources for free. I
5050
[Click here to learn more](https://github.com/PaddlePaddle/Fleet)
5151

5252

53-
- **Accelerated High-Performance Inference over Ubiquitous Deployments**
53+
- **High-Performance Inference Engines for Comprehensive Deployment Environments**
5454

55-
PaddlePaddle is not only compatible with other open-source frameworks for models training, but also works well on the ubiquitous developments, varying from platforms to devices. More specifically, PaddlePaddle accelerates the inference procedure with the fastest speed-up. Note that, a recent breakthrough of inference speed has been made by PaddlePaddle on Huawei's Kirin NPU, through the hardware/software co-optimization.
56-
[Click here to learn more](https://github.com/PaddlePaddle/Paddle-Lite)
55+
PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks, but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini apps. Furthermore, through extensive optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.
5756

5857

5958
- **Industry-Oriented Models and Libraries with Open Source Repositories**

README_cn.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
22
<p align="center">
33
<img align="center" src="doc/imgs/logo.png", width=1600>
44
<p>
@@ -47,10 +47,9 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
4747
[查看详情](https://github.com/PaddlePaddle/Fleet)
4848

4949

50-
- **多端多平台部署的高性能推理引擎**
50+
- **支持多端多平台的高性能推理部署工具**
5151

52-
飞桨不仅兼容其他开源框架训练的模型,还可以轻松地部署到不同架构的平台设备上。同时,飞桨的推理速度也是全面领先的。尤其经过了跟华为麒麟NPU的软硬一体优化,使得飞桨在NPU上的推理速度进一步突破。
53-
[查看详情](https://github.com/PaddlePaddle/Paddle-Lite)
52+
飞桨不仅广泛兼容第三方开源框架训练的模型部署,并且为不同的场景的生产环境提供了完备的推理引擎,包括适用于高性能服务器及云端推理的原生推理库 [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html),面向分布式、流水线生产环境下自动上云、A/B测试等高阶功能的服务化推理框架 [Paddle Serving](https://github.com/PaddlePaddle/Serving),针对于移动端、物联网场景的轻量化推理引擎 [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite),以及在浏览器、小程序等环境下使用的前端推理引擎 [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs)。同时,透过与不同场景下的主流硬件高度适配优化及异构计算的支持, 飞桨的推理性能也领先绝大部分的主流实现。
5453

5554

5655
- **面向产业应用,开源开放覆盖多领域的工业级模型库。**

cmake/cblas.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ if(NOT DEFINED CBLAS_PROVIDER)
7373
string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file})
7474
string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp})
7575

76-
if (${ver} VERSION_EQUAL "0.3.7")
76+
if (${ver} VERSION_GREATER_EQUAL "0.3.7")
7777
set(CBLAS_PROVIDER OPENBLAS)
7878
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
7979
set(CBLAS_LIBRARIES ${OPENBLAS_LIB})

paddle/fluid/framework/ir/op_compat_sensible_pass.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ InputOrOutputCompat& InputOrOutputCompat::IsOptional() {
117117

118118
bool InputOrOutputCompat::operator()(
119119
const std::vector<std::string>& input) const {
120-
if (input.empty()) return false;
120+
if (input.empty()) return optional_;
121121
for (auto& func : conditions_) {
122122
if (!func(input)) {
123123
return false;

paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,11 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
129129
return;
130130
}
131131

132+
if (!IsCompat(subgraph, graph)) {
133+
LOG(WARNING) << "skip_layernorm pass in op compat failed.";
134+
return;
135+
}
136+
132137
VLOG(4) << "handle SkipLayerNorm fuse";
133138
GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, fused_pattern);
134139
GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, fused_pattern);

paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,49 @@ class Graph;
3333

3434
class SkipLayerNormFusePass : public FusePassBase {
3535
public:
36+
SkipLayerNormFusePass() {
37+
AddOpCompat(OpCompat("elementwise_add"))
38+
.AddInput("X")
39+
.IsTensor()
40+
.End()
41+
.AddInput("Y")
42+
.IsTensor()
43+
.End()
44+
.AddOutput("Out")
45+
.IsTensor()
46+
.End()
47+
.AddAttr("axis")
48+
.IsIntIn({0, -1})
49+
.End();
50+
51+
AddOpCompat(OpCompat("layer_norm"))
52+
.AddInput("X")
53+
.IsTensor()
54+
.End()
55+
.AddInput("Scale")
56+
.IsTensor()
57+
.End()
58+
.AddInput("Bias")
59+
.IsTensor()
60+
.End()
61+
.AddOutput("Y")
62+
.IsTensor()
63+
.End()
64+
.AddOutput("Mean")
65+
.IsTensor()
66+
.End()
67+
.AddOutput("Variance")
68+
.IsTensor()
69+
.End()
70+
.AddAttr("epsilon")
71+
.IsNumGE(0.0f)
72+
.IsNumLE(0.001f)
73+
.End()
74+
.AddAttr("begin_norm_axis")
75+
.IsNumGT(0)
76+
.End();
77+
}
78+
3679
virtual ~SkipLayerNormFusePass() {}
3780

3881
protected:

paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
3636
framework::OpDesc op_desc(op, nullptr);
3737
auto word_id_name = op_desc.Input("WordId").front();
3838
auto pos_id_name = op_desc.Input("PosId").front();
39+
engine_->Set("ernie_pos_name", new std::string(pos_id_name));
40+
3941
auto sent_id_name = op_desc.Input("SentId").front();
4042
auto word_emb_name = op_desc.Input("WordEmbedding").front();
4143
auto pos_emb_name = op_desc.Input("PosEmbedding").front();

paddle/fluid/inference/tensorrt/convert/fc_op.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class FcOpConverter : public OpConverter {
4848
}
4949
// Declare inputs
5050
auto* X = engine_->GetITensor(op_desc.Input(i_name).front());
51+
auto x_dim = X->getDimensions();
5152
// Declare weights
5253
auto* Y_v = scope.FindVar(op_desc.Input(w_name).front());
5354
PADDLE_ENFORCE_NOT_NULL(
@@ -138,7 +139,13 @@ class FcOpConverter : public OpConverter {
138139
("fc_layer_before(Output: " + output_name + ")").c_str());
139140
// add shuffle after fc
140141
nvinfer1::Dims reshape_after_fc_dim;
141-
reshape_after_fc_dim.nbDims = x_num_col_dims + 1;
142+
if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
143+
x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 1) {
144+
// If use tensorrt'oss, the x_dim and x_num_col_dims need change
145+
reshape_after_fc_dim.nbDims = 4;
146+
} else {
147+
reshape_after_fc_dim.nbDims = x_num_col_dims + 1;
148+
}
142149
for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) {
143150
reshape_after_fc_dim.d[i] = 0;
144151
}
@@ -181,11 +188,15 @@ class FcOpConverter : public OpConverter {
181188
static_cast<void*>(bias_data),
182189
static_cast<size_t>(bias_num)};
183190

184-
auto x_dim = X->getDimensions();
185191
// Running the TRT Static Shape mode: x_num_col_dims-1
186192
if (!engine_->with_dynamic_shape()) {
187193
x_num_col_dims--;
188194
}
195+
// If use tensorrt'oss, the x_dim and x_num_col_dims need change
196+
if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
197+
x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) {
198+
x_num_col_dims = 1;
199+
}
189200
PADDLE_ENFORCE_GT(
190201
x_dim.nbDims, x_num_col_dims,
191202
platform::errors::InvalidArgument(

paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,15 @@ class MultiheadMatMulOpConverter : public OpConverter {
191191
std::vector<nvinfer1::ITensor*> plugin_inputs;
192192
plugin_inputs.emplace_back(fc_layer->getOutput(0));
193193
plugin_inputs.emplace_back(mask_tensor);
194-
plugin_inputs.emplace_back(engine_->GetITensor(
195-
engine_->network()->getInput(2)->getName())); // cu_seqlens,
196-
// eval_placeholder_2
194+
if (engine_->Has("ernie_pos_name")) {
195+
plugin_inputs.emplace_back(
196+
engine_->GetITensor(engine_->Get<std::string>("ernie_pos_name")));
197+
} else {
198+
plugin_inputs.emplace_back(engine_->GetITensor(
199+
engine_->network()
200+
->getInput(2)
201+
->getName())); // cu_seqlens, eval_placeholder_2
202+
}
197203
auto max_seqlen_tensor =
198204
engine_->GetITensor(engine_->network()->getInput(3)->getName());
199205
auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(

0 commit comments

Comments
 (0)