Skip to content

Commit 311a5f0

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev/fuse_all_opt
2 parents 11b2793 + ab0272e commit 311a5f0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+1760
-394
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ option(WITH_STRIP "Strip so files of Whl packages" OFF)
216216

217217
# PY_VERSION
218218
if(NOT PY_VERSION)
219-
set(PY_VERSION 3.7)
219+
set(PY_VERSION 3.6)
220220
endif()
221221
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
222222

README.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<p align="center">
1+
<p align="center">
22
<img align="center" src="doc/imgs/logo.png", width=1600>
33
<p>
44

@@ -50,10 +50,9 @@ Now our developers can acquire Tesla V100 online computing resources for free. I
5050
[Click here to learn more](https://github.com/PaddlePaddle/Fleet)
5151

5252

53-
- **Accelerated High-Performance Inference over Ubiquitous Deployments**
53+
- **High-Performance Inference Engines for Comprehensive Deployment Environments**
5454

55-
PaddlePaddle is not only compatible with other open-source frameworks for models training, but also works well on the ubiquitous developments, varying from platforms to devices. More specifically, PaddlePaddle accelerates the inference procedure with the fastest speed-up. Note that, a recent breakthrough of inference speed has been made by PaddlePaddle on Huawei's Kirin NPU, through the hardware/software co-optimization.
56-
[Click here to learn more](https://github.com/PaddlePaddle/Paddle-Lite)
55+
PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks, but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini apps. Furthermore, through extensive optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.
5756

5857

5958
- **Industry-Oriented Models and Libraries with Open Source Repositories**

README_cn.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
22
<p align="center">
33
<img align="center" src="doc/imgs/logo.png", width=1600>
44
<p>
@@ -47,10 +47,9 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
4747
[查看详情](https://github.com/PaddlePaddle/Fleet)
4848

4949

50-
- **多端多平台部署的高性能推理引擎**
50+
- **支持多端多平台的高性能推理部署工具**
5151

52-
飞桨不仅兼容其他开源框架训练的模型,还可以轻松地部署到不同架构的平台设备上。同时,飞桨的推理速度也是全面领先的。尤其经过了跟华为麒麟NPU的软硬一体优化,使得飞桨在NPU上的推理速度进一步突破。
53-
[查看详情](https://github.com/PaddlePaddle/Paddle-Lite)
52+
飞桨不仅广泛兼容第三方开源框架训练的模型部署,并且为不同的场景的生产环境提供了完备的推理引擎,包括适用于高性能服务器及云端推理的原生推理库 [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html),面向分布式、流水线生产环境下自动上云、A/B测试等高阶功能的服务化推理框架 [Paddle Serving](https://github.com/PaddlePaddle/Serving),针对于移动端、物联网场景的轻量化推理引擎 [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite),以及在浏览器、小程序等环境下使用的前端推理引擎 [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs)。同时,透过与不同场景下的主流硬件高度适配优化及异构计算的支持, 飞桨的推理性能也领先绝大部分的主流实现。
5453

5554

5655
- **面向产业应用,开源开放覆盖多领域的工业级模型库。**

cmake/cblas.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ if(NOT DEFINED CBLAS_PROVIDER)
7373
string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file})
7474
string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp})
7575

76-
if (${ver} VERSION_EQUAL "0.3.7")
76+
if (${ver} VERSION_GREATER_EQUAL "0.3.7")
7777
set(CBLAS_PROVIDER OPENBLAS)
7878
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
7979
set(CBLAS_LIBRARIES ${OPENBLAS_LIB})

paddle/fluid/framework/ir/op_compat_sensible_pass.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ InputOrOutputCompat& InputOrOutputCompat::IsOptional() {
117117

118118
bool InputOrOutputCompat::operator()(
119119
const std::vector<std::string>& input) const {
120-
if (input.empty()) return false;
120+
if (input.empty()) return optional_;
121121
for (auto& func : conditions_) {
122122
if (!func(input)) {
123123
return false;

paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,11 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
129129
return;
130130
}
131131

132+
if (!IsCompat(subgraph, graph)) {
133+
LOG(WARNING) << "skip_layernorm pass in op compat failed.";
134+
return;
135+
}
136+
132137
VLOG(4) << "handle SkipLayerNorm fuse";
133138
GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, fused_pattern);
134139
GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, fused_pattern);

paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,49 @@ class Graph;
3333

3434
class SkipLayerNormFusePass : public FusePassBase {
3535
public:
36+
SkipLayerNormFusePass() {
37+
AddOpCompat(OpCompat("elementwise_add"))
38+
.AddInput("X")
39+
.IsTensor()
40+
.End()
41+
.AddInput("Y")
42+
.IsTensor()
43+
.End()
44+
.AddOutput("Out")
45+
.IsTensor()
46+
.End()
47+
.AddAttr("axis")
48+
.IsIntIn({0, -1})
49+
.End();
50+
51+
AddOpCompat(OpCompat("layer_norm"))
52+
.AddInput("X")
53+
.IsTensor()
54+
.End()
55+
.AddInput("Scale")
56+
.IsTensor()
57+
.End()
58+
.AddInput("Bias")
59+
.IsTensor()
60+
.End()
61+
.AddOutput("Y")
62+
.IsTensor()
63+
.End()
64+
.AddOutput("Mean")
65+
.IsTensor()
66+
.End()
67+
.AddOutput("Variance")
68+
.IsTensor()
69+
.End()
70+
.AddAttr("epsilon")
71+
.IsNumGE(0.0f)
72+
.IsNumLE(0.001f)
73+
.End()
74+
.AddAttr("begin_norm_axis")
75+
.IsNumGT(0)
76+
.End();
77+
}
78+
3679
virtual ~SkipLayerNormFusePass() {}
3780

3881
protected:

paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
3636
framework::OpDesc op_desc(op, nullptr);
3737
auto word_id_name = op_desc.Input("WordId").front();
3838
auto pos_id_name = op_desc.Input("PosId").front();
39+
engine_->Set("ernie_pos_name", new std::string(pos_id_name));
40+
3941
auto sent_id_name = op_desc.Input("SentId").front();
4042
auto word_emb_name = op_desc.Input("WordEmbedding").front();
4143
auto pos_emb_name = op_desc.Input("PosEmbedding").front();

paddle/fluid/inference/tensorrt/convert/fc_op.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class FcOpConverter : public OpConverter {
4848
}
4949
// Declare inputs
5050
auto* X = engine_->GetITensor(op_desc.Input(i_name).front());
51+
auto x_dim = X->getDimensions();
5152
// Declare weights
5253
auto* Y_v = scope.FindVar(op_desc.Input(w_name).front());
5354
PADDLE_ENFORCE_NOT_NULL(
@@ -138,7 +139,13 @@ class FcOpConverter : public OpConverter {
138139
("fc_layer_before(Output: " + output_name + ")").c_str());
139140
// add shuffle after fc
140141
nvinfer1::Dims reshape_after_fc_dim;
141-
reshape_after_fc_dim.nbDims = x_num_col_dims + 1;
142+
if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
143+
x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 1) {
144+
// If use tensorrt'oss, the x_dim and x_num_col_dims need change
145+
reshape_after_fc_dim.nbDims = 4;
146+
} else {
147+
reshape_after_fc_dim.nbDims = x_num_col_dims + 1;
148+
}
142149
for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) {
143150
reshape_after_fc_dim.d[i] = 0;
144151
}
@@ -181,11 +188,15 @@ class FcOpConverter : public OpConverter {
181188
static_cast<void*>(bias_data),
182189
static_cast<size_t>(bias_num)};
183190

184-
auto x_dim = X->getDimensions();
185191
// Running the TRT Static Shape mode: x_num_col_dims-1
186192
if (!engine_->with_dynamic_shape()) {
187193
x_num_col_dims--;
188194
}
195+
// If use tensorrt'oss, the x_dim and x_num_col_dims need change
196+
if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
197+
x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) {
198+
x_num_col_dims = 1;
199+
}
189200
PADDLE_ENFORCE_GT(
190201
x_dim.nbDims, x_num_col_dims,
191202
platform::errors::InvalidArgument(

paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,15 @@ class MultiheadMatMulOpConverter : public OpConverter {
191191
std::vector<nvinfer1::ITensor*> plugin_inputs;
192192
plugin_inputs.emplace_back(fc_layer->getOutput(0));
193193
plugin_inputs.emplace_back(mask_tensor);
194-
plugin_inputs.emplace_back(engine_->GetITensor(
195-
engine_->network()->getInput(2)->getName())); // cu_seqlens,
196-
// eval_placeholder_2
194+
if (engine_->Has("ernie_pos_name")) {
195+
plugin_inputs.emplace_back(
196+
engine_->GetITensor(engine_->Get<std::string>("ernie_pos_name")));
197+
} else {
198+
plugin_inputs.emplace_back(engine_->GetITensor(
199+
engine_->network()
200+
->getInput(2)
201+
->getName())); // cu_seqlens, eval_placeholder_2
202+
}
197203
auto max_seqlen_tensor =
198204
engine_->GetITensor(engine_->network()->getInput(3)->getName());
199205
auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(

0 commit comments

Comments
 (0)