@@ -1118,6 +1118,50 @@ void mish(const float* din, float* dout, int size, float threshold) {
11181118 dout[i] = x * std::tanh (sp);
11191119 }
11201120}
1121+
1122+ template <>
1123+ void act_silu<float >(const float * din, float * dout, int size, int threads) {
1124+ int nums_per_thread = size / threads;
1125+ int remain = size - threads * nums_per_thread;
1126+ int neon_loop_cnt_dim4 = nums_per_thread >> 2 ;
1127+ int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2 );
1128+
1129+ // float32x4_t vzero = vdupq_n_f32(0.f);
1130+ LITE_PARALLEL_BEGIN (i, tid, threads) {
1131+ float32x4_t x_vec = vdupq_n_f32 (0 .0f );
1132+ float32x4_t exp_vec = vdupq_n_f32 (0 .0f );
1133+ float32x4_t recip = vdupq_n_f32 (0 .0f );
1134+ const float * ptr_in_thread = din + i * nums_per_thread;
1135+ float * ptr_out_thread = dout + i * nums_per_thread;
1136+ for (int k = 0 ; k < neon_loop_cnt_dim4; ++k) {
1137+ x_vec = vld1q_f32 (ptr_in_thread);
1138+ exp_vec = exp_ps (vnegq_f32 (x_vec));
1139+ exp_vec = vaddq_f32 (exp_vec, vdupq_n_f32 (1 .0f ));
1140+ recip = vrecpeq_f32 (exp_vec);
1141+ // Using Newton-Raphson step for finding the reciprocal
1142+ recip = vmulq_f32 (vrecpsq_f32 (exp_vec, recip), recip);
1143+ recip = vmulq_f32 (vrecpsq_f32 (exp_vec, recip), recip);
1144+ recip = vmulq_f32 (x_vec, recip);
1145+ vst1q_f32 (ptr_out_thread, recip);
1146+ ptr_out_thread += 4 ;
1147+ ptr_in_thread += 4 ;
1148+ }
1149+ for (int j = 0 ; j < neon_loop_remain_dim4; ++j) {
1150+ ptr_out_thread[0 ] = ptr_in_thread[0 ] / (1 + expf (-ptr_in_thread[0 ]));
1151+ ptr_in_thread++;
1152+ ptr_out_thread++;
1153+ }
1154+ }
1155+ LITE_PARALLEL_END ();
1156+ float * ptr_out = dout + threads * nums_per_thread;
1157+ const float * ptr_in = din + threads * nums_per_thread;
1158+ for (int j = 0 ; j < remain; ++j) {
1159+ ptr_out[0 ] = ptr_in[0 ] / (1 + expf (-ptr_in[0 ]));
1160+ ptr_in++;
1161+ ptr_out++;
1162+ }
1163+ }
1164+
11211165} // namespace math
11221166} // namespace arm
11231167} // namespace lite
0 commit comments