@@ -16,6 +16,7 @@ limitations under the License. */
1616#include < string>
1717
1818#include " paddle/fluid/framework/ddim.h"
19+ #include " paddle/fluid/framework/framework.pb.h"
1920#include " paddle/fluid/framework/tensor_util.h"
2021#include " paddle/fluid/operators/activation_op.h"
2122#include " paddle/fluid/operators/npu_op_runner.h"
@@ -388,6 +389,155 @@ class SigmoidGradNPUKernel : public framework::OpKernel<T> {
388389 }
389390};
390391
392+ // HardSwish = min(max(0, x+offset), threshold) * x / scale
393+ template <typename T>
394+ class HardSwishNPUKernel : public framework ::OpKernel<T> {
395+ public:
396+ void Compute (const framework::ExecutionContext& ctx) const override {
397+ auto * x = ctx.Input <Tensor>(" X" );
398+ auto * out = ctx.Output <Tensor>(" Out" );
399+
400+ float threshold = ctx.Attr <float >(" threshold" );
401+ float scale = ctx.Attr <float >(" scale" );
402+ float offset = ctx.Attr <float >(" offset" );
403+
404+ auto place = ctx.GetPlace ();
405+
406+ out->mutable_data <T>(place);
407+
408+ auto stream =
409+ ctx.template device_context <paddle::platform::NPUDeviceContext>()
410+ .stream ();
411+
412+ Tensor tensor_offset (x->type ());
413+ tensor_offset.mutable_data <T>({1 }, place);
414+ FillNpuTensorWithConstant<T>(&tensor_offset, static_cast <T>(offset));
415+
416+ Tensor add_offset_val (x->type ());
417+ add_offset_val.mutable_data <T>(x->dims (), place);
418+ const auto & runner_add =
419+ NpuOpRunner (" AddV2" , {*x, tensor_offset}, {add_offset_val});
420+ runner_add.Run (stream);
421+
422+ Tensor tensor_threshold (x->type ());
423+ tensor_threshold.mutable_data <T>({1 }, place);
424+ FillNpuTensorWithConstant<T>(&tensor_threshold, static_cast <T>(threshold));
425+
426+ Tensor tensor_zero (x->type ());
427+ tensor_zero.mutable_data <T>({1 }, place);
428+ FillNpuTensorWithConstant<T>(&tensor_zero, static_cast <T>(0.0 ));
429+
430+ Tensor clip_val (x->type ());
431+ clip_val.mutable_data <T>(x->dims (), place);
432+ const auto & runner_clip = NpuOpRunner (
433+ " ClipByValue" , {add_offset_val, tensor_zero, tensor_threshold},
434+ {clip_val});
435+ runner_clip.Run (stream);
436+
437+ Tensor tensor_scale_tmp (x->type ());
438+ tensor_scale_tmp.mutable_data <T>({1 }, place);
439+ FillNpuTensorWithConstant<T>(&tensor_scale_tmp, static_cast <T>(scale));
440+ Tensor tensor_scale (x->type ());
441+ tensor_scale.mutable_data <T>(x->dims (), place);
442+ const auto & runner_fill =
443+ NpuOpRunner (" FillD" , {tensor_scale_tmp}, {tensor_scale},
444+ {{" dims" , framework::vectorize (x->dims ())}});
445+ runner_fill.Run (stream);
446+
447+ Tensor div_val (x->type ());
448+ div_val.mutable_data <T>(x->dims (), place);
449+ const auto & runner_div =
450+ NpuOpRunner (" Div" , {clip_val, tensor_scale}, {div_val});
451+ runner_div.Run (stream);
452+
453+ const auto & runner_mul = NpuOpRunner (" Mul" , {*x, div_val}, {*out});
454+ runner_mul.Run (stream);
455+ }
456+ };
457+
458+ template <typename T>
459+ class HardSwishGradNPUKernel : public framework ::OpKernel<T> {
460+ public:
461+ void Compute (const framework::ExecutionContext& ctx) const override {
462+ auto * x = ctx.Input <Tensor>(" X" );
463+ auto * dout = ctx.Input <Tensor>(framework::GradVarName (" Out" ));
464+ auto * dx = ctx.Output <Tensor>(framework::GradVarName (" X" ));
465+
466+ float threshold = ctx.Attr <float >(" threshold" );
467+ float scale = ctx.Attr <float >(" scale" );
468+ float offset = ctx.Attr <float >(" offset" );
469+
470+ auto place = ctx.GetPlace ();
471+
472+ dx->mutable_data <T>(place);
473+
474+ auto stream =
475+ ctx.template device_context <paddle::platform::NPUDeviceContext>()
476+ .stream ();
477+
478+ Tensor tensor_offset (x->type ());
479+ tensor_offset.mutable_data <T>({1 }, place);
480+ FillNpuTensorWithConstant<T>(&tensor_offset, static_cast <T>(offset));
481+
482+ Tensor add_offset_val (x->type ());
483+ add_offset_val.mutable_data <T>(x->dims (), place);
484+ const auto & runner_add =
485+ NpuOpRunner (" AddV2" , {*x, tensor_offset}, {add_offset_val});
486+ runner_add.Run (stream);
487+
488+ Tensor tmp1 (x->type ());
489+ tmp1.mutable_data <T>(x->dims (), place);
490+ const auto & runner_pow1 = NpuOpRunner (" Power" , {*x}, {tmp1},
491+ {{" scale" , 2 .0f }, {" shift" , offset}});
492+ runner_pow1.Run (stream);
493+
494+ Tensor tmp2 (x->type ());
495+ tmp2.mutable_data <T>(x->dims (), place);
496+ const auto & runner_ht_grad =
497+ NpuOpRunner (" HardtanhGrad" , {add_offset_val, tmp1}, {tmp2},
498+ {{" min_val" , 0 .0f }, {" max_val" , threshold}});
499+ runner_ht_grad.Run (stream);
500+
501+ Tensor tmp3 (x->type ());
502+ tmp3.mutable_data <T>(x->dims (), place);
503+ const auto & runner_pow2 = NpuOpRunner (
504+ " Power" , {tmp2}, {tmp3}, {{" scale" , 1 .0f / scale}, {" shift" , 1 .0f }});
505+ runner_pow2.Run (stream);
506+
507+ Tensor tensor_threshold_tmp (x->type ());
508+ tensor_threshold_tmp.mutable_data <T>({1 }, place);
509+ FillNpuTensorWithConstant<T>(&tensor_threshold_tmp,
510+ static_cast <T>(threshold));
511+ Tensor tensor_threshold (x->type ());
512+ tensor_threshold.mutable_data <T>(x->dims (), place);
513+ const auto & runner_fill =
514+ NpuOpRunner (" FillD" , {tensor_threshold_tmp}, {tensor_threshold},
515+ {{" dims" , framework::vectorize (x->dims ())}});
516+ runner_fill.Run (stream);
517+
518+ Tensor tmp_bool (framework::proto::VarType::BOOL);
519+ tmp_bool.mutable_data <bool >(x->dims (), place);
520+ const auto & runner_less =
521+ NpuOpRunner (" Less" , {add_offset_val, tensor_threshold}, {tmp_bool});
522+ runner_less.Run (stream);
523+ Tensor tmp4 (x->type ());
524+ tmp4.mutable_data <T>(x->dims (), place);
525+ auto dst_dtype = ConvertToNpuDtype (x->type ());
526+ const auto & runner_cast =
527+ NpuOpRunner (" Cast" , {tmp_bool}, {tmp4},
528+ {{" dst_type" , static_cast <int >(dst_dtype)}});
529+ runner_cast.Run (stream);
530+
531+ Tensor tmp5 (x->type ());
532+ tmp5.mutable_data <T>(x->dims (), place);
533+ const auto & runner_sub = NpuOpRunner (" Sub" , {tmp3, tmp4}, {tmp5});
534+ runner_sub.Run (stream);
535+
536+ const auto & runner_final = NpuOpRunner (" Mul" , {tmp5, *dout}, {*dx});
537+ runner_final.Run (stream);
538+ }
539+ };
540+
391541template <typename DeviceContext, typename T>
392542class HardSigmoidNPUKernel : public framework ::OpKernel<T> {
393543 public:
@@ -677,6 +827,12 @@ REGISTER_OP_NPU_KERNEL(
677827 ops::SigmoidGradNPUKernel<paddle::platform::NPUDeviceContext,
678828 paddle::platform::float16>);
679829
830+ REGISTER_OP_NPU_KERNEL (hard_swish, ops::HardSwishNPUKernel<float >,
831+ ops::HardSwishNPUKernel<paddle::platform::float16>);
832+
833+ REGISTER_OP_NPU_KERNEL (hard_swish_grad, ops::HardSwishGradNPUKernel<float >,
834+ ops::HardSwishGradNPUKernel<paddle::platform::float16>);
835+
680836REGISTER_OP_NPU_KERNEL (
681837 hard_sigmoid,
682838 ops::HardSigmoidNPUKernel<paddle::platform::NPUDeviceContext, float >,
0 commit comments