@@ -16,6 +16,7 @@ limitations under the License. */
1616#include < string>
1717
1818#include " paddle/fluid/framework/ddim.h"
19+ #include " paddle/fluid/framework/framework.pb.h"
1920#include " paddle/fluid/framework/tensor_util.h"
2021#include " paddle/fluid/operators/activation_op.h"
2122#include " paddle/fluid/operators/npu_op_runner.h"
@@ -388,6 +389,155 @@ class SigmoidGradNPUKernel : public framework::OpKernel<T> {
388389 }
389390};
390391
392+ // HardSwish = min(max(0, x+offset), threshold) * x / scale
393+ template <typename T>
394+ class HardSwishNPUKernel : public framework ::OpKernel<T> {
395+ public:
396+ void Compute (const framework::ExecutionContext& ctx) const override {
397+ auto * x = ctx.Input <Tensor>(" X" );
398+ auto * out = ctx.Output <Tensor>(" Out" );
399+
400+ float threshold = ctx.Attr <float >(" threshold" );
401+ float scale = ctx.Attr <float >(" scale" );
402+ float offset = ctx.Attr <float >(" offset" );
403+
404+ auto place = ctx.GetPlace ();
405+
406+ out->mutable_data <T>(place);
407+
408+ auto stream =
409+ ctx.template device_context <paddle::platform::NPUDeviceContext>()
410+ .stream ();
411+
412+ Tensor tensor_offset (x->type ());
413+ tensor_offset.mutable_data <T>({1 }, place);
414+ FillNpuTensorWithConstant<T>(&tensor_offset, static_cast <T>(offset));
415+
416+ Tensor add_offset_val (x->type ());
417+ add_offset_val.mutable_data <T>(x->dims (), place);
418+ const auto & runner_add =
419+ NpuOpRunner (" AddV2" , {*x, tensor_offset}, {add_offset_val});
420+ runner_add.Run (stream);
421+
422+ Tensor tensor_threshold (x->type ());
423+ tensor_threshold.mutable_data <T>({1 }, place);
424+ FillNpuTensorWithConstant<T>(&tensor_threshold, static_cast <T>(threshold));
425+
426+ Tensor tensor_zero (x->type ());
427+ tensor_zero.mutable_data <T>({1 }, place);
428+ FillNpuTensorWithConstant<T>(&tensor_zero, static_cast <T>(0.0 ));
429+
430+ Tensor clip_val (x->type ());
431+ clip_val.mutable_data <T>(x->dims (), place);
432+ const auto & runner_clip = NpuOpRunner (
433+ " ClipByValue" , {add_offset_val, tensor_zero, tensor_threshold},
434+ {clip_val});
435+ runner_clip.Run (stream);
436+
437+ Tensor tensor_scale_tmp (x->type ());
438+ tensor_scale_tmp.mutable_data <T>({1 }, place);
439+ FillNpuTensorWithConstant<T>(&tensor_scale_tmp, static_cast <T>(scale));
440+ Tensor tensor_scale (x->type ());
441+ tensor_scale.mutable_data <T>(x->dims (), place);
442+ const auto & runner_fill =
443+ NpuOpRunner (" FillD" , {tensor_scale_tmp}, {tensor_scale},
444+ {{" dims" , framework::vectorize (x->dims ())}});
445+ runner_fill.Run (stream);
446+
447+ Tensor div_val (x->type ());
448+ div_val.mutable_data <T>(x->dims (), place);
449+ const auto & runner_div =
450+ NpuOpRunner (" Div" , {clip_val, tensor_scale}, {div_val});
451+ runner_div.Run (stream);
452+
453+ const auto & runner_mul = NpuOpRunner (" Mul" , {*x, div_val}, {*out});
454+ runner_mul.Run (stream);
455+ }
456+ };
457+
458+ template <typename T>
459+ class HardSwishGradNPUKernel : public framework ::OpKernel<T> {
460+ public:
461+ void Compute (const framework::ExecutionContext& ctx) const override {
462+ auto * x = ctx.Input <Tensor>(" X" );
463+ auto * dout = ctx.Input <Tensor>(framework::GradVarName (" Out" ));
464+ auto * dx = ctx.Output <Tensor>(framework::GradVarName (" X" ));
465+
466+ float threshold = ctx.Attr <float >(" threshold" );
467+ float scale = ctx.Attr <float >(" scale" );
468+ float offset = ctx.Attr <float >(" offset" );
469+
470+ auto place = ctx.GetPlace ();
471+
472+ dx->mutable_data <T>(place);
473+
474+ auto stream =
475+ ctx.template device_context <paddle::platform::NPUDeviceContext>()
476+ .stream ();
477+
478+ Tensor tensor_offset (x->type ());
479+ tensor_offset.mutable_data <T>({1 }, place);
480+ FillNpuTensorWithConstant<T>(&tensor_offset, static_cast <T>(offset));
481+
482+ Tensor add_offset_val (x->type ());
483+ add_offset_val.mutable_data <T>(x->dims (), place);
484+ const auto & runner_add =
485+ NpuOpRunner (" AddV2" , {*x, tensor_offset}, {add_offset_val});
486+ runner_add.Run (stream);
487+
488+ Tensor tmp1 (x->type ());
489+ tmp1.mutable_data <T>(x->dims (), place);
490+ const auto & runner_pow1 = NpuOpRunner (" Power" , {*x}, {tmp1},
491+ {{" scale" , 2 .0f }, {" shift" , offset}});
492+ runner_pow1.Run (stream);
493+
494+ Tensor tmp2 (x->type ());
495+ tmp2.mutable_data <T>(x->dims (), place);
496+ const auto & runner_ht_grad =
497+ NpuOpRunner (" HardtanhGrad" , {add_offset_val, tmp1}, {tmp2},
498+ {{" min_val" , 0 .0f }, {" max_val" , threshold}});
499+ runner_ht_grad.Run (stream);
500+
501+ Tensor tmp3 (x->type ());
502+ tmp3.mutable_data <T>(x->dims (), place);
503+ const auto & runner_pow2 = NpuOpRunner (
504+ " Power" , {tmp2}, {tmp3}, {{" scale" , 1 .0f / scale}, {" shift" , 1 .0f }});
505+ runner_pow2.Run (stream);
506+
507+ Tensor tensor_threshold_tmp (x->type ());
508+ tensor_threshold_tmp.mutable_data <T>({1 }, place);
509+ FillNpuTensorWithConstant<T>(&tensor_threshold_tmp,
510+ static_cast <T>(threshold));
511+ Tensor tensor_threshold (x->type ());
512+ tensor_threshold.mutable_data <T>(x->dims (), place);
513+ const auto & runner_fill =
514+ NpuOpRunner (" FillD" , {tensor_threshold_tmp}, {tensor_threshold},
515+ {{" dims" , framework::vectorize (x->dims ())}});
516+ runner_fill.Run (stream);
517+
518+ Tensor tmp_bool (framework::proto::VarType::BOOL);
519+ tmp_bool.mutable_data <bool >(x->dims (), place);
520+ const auto & runner_less =
521+ NpuOpRunner (" Less" , {add_offset_val, tensor_threshold}, {tmp_bool});
522+ runner_less.Run (stream);
523+ Tensor tmp4 (x->type ());
524+ tmp4.mutable_data <T>(x->dims (), place);
525+ auto dst_dtype = ConvertToNpuDtype (x->type ());
526+ const auto & runner_cast =
527+ NpuOpRunner (" Cast" , {tmp_bool}, {tmp4},
528+ {{" dst_type" , static_cast <int >(dst_dtype)}});
529+ runner_cast.Run (stream);
530+
531+ Tensor tmp5 (x->type ());
532+ tmp5.mutable_data <T>(x->dims (), place);
533+ const auto & runner_sub = NpuOpRunner (" Sub" , {tmp3, tmp4}, {tmp5});
534+ runner_sub.Run (stream);
535+
536+ const auto & runner_final = NpuOpRunner (" Mul" , {tmp5, *dout}, {*dx});
537+ runner_final.Run (stream);
538+ }
539+ };
540+
391541template <typename DeviceContext, typename T>
392542class HardSigmoidNPUKernel : public framework ::OpKernel<T> {
393543 public:
@@ -677,6 +827,12 @@ REGISTER_OP_NPU_KERNEL(
677827 ops::SigmoidGradNPUKernel<paddle::platform::NPUDeviceContext,
678828 paddle::platform::float16>);
679829
830+ REGISTER_OP_NPU_KERNEL (hard_swish, ops::HardSwishNPUKernel<float >,
831+ ops::HardSwishNPUKernel<paddle::platform::float16>);
832+
833+ REGISTER_OP_NPU_KERNEL (hard_swish_grad, ops::HardSwishGradNPUKernel<float >,
834+ ops::HardSwishGradNPUKernel<paddle::platform::float16>);
835+
680836REGISTER_OP_NPU_KERNEL (
681837 hard_sigmoid,
682838 ops::HardSigmoidNPUKernel<paddle::platform::NPUDeviceContext, float >,
0 commit comments