diff --git a/lite/backends/arm/math/sparse_semi_conv_impl.cc b/lite/backends/arm/math/sparse_semi_conv_impl.cc new file mode 100644 index 00000000000..c1ed1c1bf26 --- /dev/null +++ b/lite/backends/arm/math/sparse_semi_conv_impl.cc @@ -0,0 +1,11194 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/sparse_semi_conv_impl.h" +#include <arm_neon.h> +#include <vector> +#include "lite/core/parallel_defines.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +#ifdef __aarch64__ + +#define SPARSE_F32_F32_W48_SEMI2_V8_KERNEL \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "dup v8.4s, v0.s[0]\n" \ + "dup v9.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "dup v10.4s, v0.s[0]\n" \ + "dup v11.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "dup v12.4s, v0.s[0]\n" \ + "dup v13.4s, v0.s[0]\n" \ + "dup v14.4s, v0.s[0]\n" \ + "dup v15.4s, v0.s[0]\n" \ + "dup v16.4s, v0.s[0]\n" \ + "dup v17.4s, v0.s[0]\n" \ + "dup v18.4s, v0.s[0]\n" \ + "dup v19.4s, v0.s[0]\n" \ + "dup v20.4s, v0.s[1]\n" \ + "dup v21.4s, v0.s[1]\n" \ + "dup v22.4s, v0.s[1]\n" \ + "dup v23.4s, v0.s[1]\n" \ + "dup v24.4s, v0.s[1]\n" \ + "prfm pldl1keep, [%[b_ptr], #192]\n" \ + "dup v25.4s, v0.s[1]\n" \ + "dup v26.4s, v0.s[1]\n" \ + "dup v27.4s, v0.s[1]\n" \ + "dup v28.4s, v0.s[1]\n" \ + "dup v29.4s, v0.s[1]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.2s}, [%[a_ptr]], #8\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldp q1, q2, [%[b_ptr]]\n" \ + "fmla v8.4s, v1.4s, v0.s[0]\n" \ + "fmla v9.4s, v2.4s, v0.s[0]\n" \ + "fmla v20.4s, v1.4s, v0.s[1]\n" \ + "fmla v21.4s, v2.4s, v0.s[1]\n" \ + "ldp q3, q4, [%[b_ptr], #32]\n" \ + "fmla v10.4s, v3.4s, v0.s[0]\n" \ + "fmla v11.4s, v4.4s, v0.s[0]\n" \ + "fmla v22.4s, v3.4s, v0.s[1]\n" \ + "fmla v23.4s, v4.4s, v0.s[1]\n" \ + "ldp q5, q6, [%[b_ptr], #64]\n" \ + "fmla v12.4s, v5.4s, v0.s[0]\n" \ + "fmla v13.4s, v6.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "fmla v24.4s, v5.4s, v0.s[1]\n" \ + "fmla v25.4s, v6.4s, v0.s[1]\n" \ + "ldp q1, q2, [%[b_ptr], #96]\n" \ + "fmla v14.4s, v1.4s, v0.s[0]\n" \ + "fmla v15.4s, v2.4s, v0.s[0]\n" \ + "ldp q3, q4, [%[b_ptr], #128]\n" \ + "fmla v26.4s, v1.4s, v0.s[1]\n" \ + "fmla v27.4s, v2.4s, v0.s[1]\n" \ + "ldp q5, q6, [%[b_ptr], #160]\n" \ + "fmla v16.4s, v3.4s, v0.s[0]\n" \ + "fmla v17.4s, v4.4s, v0.s[0]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "fmla v18.4s, v5.4s, v0.s[0]\n" \ + "fmla v19.4s, v6.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "fmla v28.4s, v3.4s, v0.s[1]\n" \ + "prfm pldl1keep, [%[b_ptr], #128]\n" \ + "fmla v29.4s, v4.4s, v0.s[1]\n" \ + "fmla v30.4s, v5.4s, v0.s[1]\n" \ + "fmla v31.4s, v6.4s, v0.s[1]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W32_SEMI2_V8_KERNEL \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "dup v16.4s, v0.s[0]\n" \ + "dup v17.4s, v0.s[0]\n" \ +
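/* W32 SEMI2 tile: v16..v23 accumulate output row 2*i from bias lane 0, */ \ + /* and v24..v31 accumulate row 2*i+1 from bias lane 1 */ \ +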
"dup v18.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "dup v19.4s, v0.s[0]\n" \ + "dup v20.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "dup v21.4s, v0.s[0]\n" \ + "dup v22.4s, v0.s[0]\n" \ + "dup v23.4s, v0.s[0]\n" \ + "dup v24.4s, v0.s[1]\n" \ + "prfm pldl1keep, [%[b_ptr], #128]\n" \ + "dup v25.4s, v0.s[1]\n" \ + "dup v26.4s, v0.s[1]\n" \ + "dup v27.4s, v0.s[1]\n" \ + "dup v28.4s, v0.s[1]\n" \ + "dup v29.4s, v0.s[1]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.2s}, [%[a_ptr]], #8\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldp q1, q2, [%[b_ptr]]\n" \ + "ldp q3, q4, [%[b_ptr], #32]\n" \ + "ldp q5, q6, [%[b_ptr], #64]\n" \ + "ldp q7, q8, [%[b_ptr], #96]\n" \ + "fmla v16.4s, v1.4s, v0.s[0]\n" \ + "fmla v17.4s, v2.4s, v0.s[0]\n" \ + "fmla v24.4s, v1.4s, v0.s[1]\n" \ + "fmla v25.4s, v2.4s, v0.s[1]\n" \ + "fmla v18.4s, v3.4s, v0.s[0]\n" \ + "fmla v19.4s, v4.4s, v0.s[0]\n" \ + "fmla v26.4s, v3.4s, v0.s[1]\n" \ + "fmla v27.4s, v4.4s, v0.s[1]\n" \ + "fmla v20.4s, v5.4s, v0.s[0]\n" \ + "fmla v21.4s, v6.4s, v0.s[0]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "fmla v28.4s, v5.4s, v0.s[1]\n" \ + "fmla v29.4s, v6.4s, v0.s[1]\n" \ + "fmla v22.4s, v7.4s, v0.s[0]\n" \ + "fmla v23.4s, v8.4s, v0.s[0]\n" \ + "fmla v30.4s, v7.4s, v0.s[1]\n" \ + "fmla v31.4s, v8.4s, v0.s[1]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W16_SEMI2_V8_KERNEL \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "dup v24.4s, v0.s[0]\n" \ + "dup v25.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "dup v26.4s, v0.s[0]\n" \ + "dup v27.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "dup v28.4s, v0.s[1]\n" \ + "dup v29.4s, v0.s[1]\n" \ + "prfm pldl1keep, [%[b_ptr], #128]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.2s}, [%[a_ptr]], #8\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldp q1, q2, [%[b_ptr]]\n" \ + "ldp q3, q4, [%[b_ptr], #32]\n" \ + "fmla v24.4s, v1.4s, v0.s[0]\n" \ + "fmla v25.4s, v2.4s, v0.s[0]\n" \ + "fmla v26.4s, v3.4s, v0.s[0]\n" \ + "fmla v27.4s, v4.4s, v0.s[0]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "fmla v28.4s, v1.4s, v0.s[1]\n" \ + "fmla v29.4s, v2.4s, v0.s[1]\n" \ + "fmla v30.4s, v3.4s, v0.s[1]\n" \ + "fmla v31.4s, v4.4s, v0.s[1]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W8_SEMI2_V8_KERNEL \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "dup v28.4s, v0.s[0]\n" \ + "dup v29.4s, v0.s[0]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.2s}, [%[a_ptr]], #8\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldp q1, q2, [%[b_ptr]]\n" \ + "fmla v28.4s, v1.4s, v0.s[0]\n" \ + "fmla v29.4s, v2.4s, v0.s[0]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "fmla v30.4s, v1.4s, v0.s[1]\n" \ + "fmla v31.4s, v2.4s, v0.s[1]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W4_SEMI2_V8_KERNEL \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "dup v30.4s, v0.s[0]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.2s}, [%[a_ptr]], #8\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldr q1, [%[b_ptr]]\n" \ + "fmla v30.4s, v1.4s, v0.s[0]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "fmla v31.4s, v1.4s, v0.s[1]\n" \ + "bne 0b\n" \ + "1:\n" + +#define 
SPARSE_F32_F32_W48_SEMI1_V8_KERNEL \ + "dup v20.4s, %w[vbias]\n" \ + "dup v21.4s, v20.s[0]\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "dup v22.4s, v20.s[0]\n" \ + "dup v23.4s, v20.s[0]\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "dup v24.4s, v20.s[0]\n" \ + "dup v25.4s, v20.s[0]\n" \ + "dup v26.4s, v20.s[0]\n" \ + "prfm pldl1keep, [%[b_ptr], #192]\n" \ + "dup v27.4s, v20.s[0]\n" \ + "dup v28.4s, v20.s[0]\n" \ + "dup v29.4s, v20.s[0]\n" \ + "dup v30.4s, v20.s[0]\n" \ + "dup v31.4s, v20.s[0]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.s}[0], [%[a_ptr]], #4\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldp q1, q2, [%[b_ptr]]\n" \ + "ldp q3, q4, [%[b_ptr], #32]\n" \ + "ldp q5, q6, [%[b_ptr], #64]\n" \ + "ldp q7, q8, [%[b_ptr], #96]\n" \ + "ldp q9, q10, [%[b_ptr], #128]\n" \ + "ldp q11, q12, [%[b_ptr], #160]\n" \ + "fmla v20.4s, v1.4s, v0.s[0]\n" \ + "fmla v21.4s, v2.4s, v0.s[0]\n" \ + "fmla v22.4s, v3.4s, v0.s[0]\n" \ + "fmla v23.4s, v4.4s, v0.s[0]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "fmla v24.4s, v5.4s, v0.s[0]\n" \ + "fmla v25.4s, v6.4s, v0.s[0]\n" \ + "fmla v26.4s, v7.4s, v0.s[0]\n" \ + "fmla v27.4s, v8.4s, v0.s[0]\n" \ + "fmla v28.4s, v9.4s, v0.s[0]\n" \ + "fmla v29.4s, v10.4s, v0.s[0]\n" \ + "fmla v30.4s, v11.4s, v0.s[0]\n" \ + "fmla v31.4s, v12.4s, v0.s[0]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W32_SEMI1_V8_KERNEL \ + "dup v20.4s, %w[vbias]\n" \ + "dup v21.4s, v20.s[0]\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "dup v22.4s, v20.s[0]\n" \ + "dup v23.4s, v20.s[0]\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "dup v24.4s, v20.s[0]\n" \ + "dup v25.4s, v20.s[0]\n" \ + "dup v26.4s, v20.s[0]\n" \ + "prfm pldl1keep, [%[b_ptr], #192]\n" \ + "dup v27.4s, v20.s[0]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.s}[0], [%[a_ptr]], #4\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldp q1, q2, [%[b_ptr]]\n" \ + "ldp q3, q4, [%[b_ptr], #32]\n" \ + "ldp q5, q6, [%[b_ptr], #64]\n" \ + "ldp q7, q8, [%[b_ptr], #96]\n" \ + "fmla v20.4s, v1.4s, v0.s[0]\n" \ + "fmla v21.4s, v2.4s, v0.s[0]\n" \ + "fmla v22.4s, v3.4s, v0.s[0]\n" \ + "fmla v23.4s, v4.4s, v0.s[0]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "fmla v24.4s, v5.4s, v0.s[0]\n" \ + "fmla v25.4s, v6.4s, v0.s[0]\n" \ + "fmla v26.4s, v7.4s, v0.s[0]\n" \ + "fmla v27.4s, v8.4s, v0.s[0]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W16_SEMI1_V8_KERNEL \ + "dup v20.4s, %w[vbias]\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "dup v21.4s, v20.s[0]\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "dup v22.4s, v20.s[0]\n" \ + "prfm pldl1keep, [%[b_ptr], #192]\n" \ + "dup v23.4s, v20.s[0]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.s}[0], [%[a_ptr]], #4\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldp q1, q2, [%[b_ptr]]\n" \ + "ldp q3, q4, [%[b_ptr], #32]\n" \ + "fmla v20.4s, v1.4s, v0.s[0]\n" \ + "fmla v21.4s, v2.4s, v0.s[0]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "fmla v22.4s, v3.4s, v0.s[0]\n" \ + "fmla v23.4s, v4.4s, v0.s[0]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W8_SEMI1_V8_KERNEL \ + "dup v20.4s, %w[vbias]\n" \ + "dup v21.4s, v20.s[0]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.s}[0], [%[a_ptr]], #4\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldp q1, q2, [%[b_ptr]]\n" \ + "fmla v20.4s, v1.4s, v0.s[0]\n" \ + "add %[b_ptr], 
%[b_ptr], x1\n" \ + "fmla v21.4s, v2.4s, v0.s[0]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W4_SEMI1_V8_KERNEL \ + "dup v20.4s, %w[vbias]\n" \ + "cbz %w[k], 1f\n" /* main loop*/ \ + "0:\n" \ + "ld1 {v0.s}[0], [%[a_ptr]], #4\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ldr q1, [%[b_ptr]]\n" \ + "fmla v20.4s, v1.4s, v0.s[0]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W48_SEMI2_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v8.4s, v8.4s, v0.4s\n" \ + "fmax v9.4s, v9.4s, v0.4s\n" \ + "fmax v10.4s, v10.4s, v0.4s\n" \ + "fmax v11.4s, v11.4s, v0.4s\n" \ + "fmax v12.4s, v12.4s, v0.4s\n" \ + "fmax v13.4s, v13.4s, v0.4s\n" \ + "fmax v14.4s, v14.4s, v0.4s\n" \ + "fmax v15.4s, v15.4s, v0.4s\n" \ + "fmax v16.4s, v16.4s, v0.4s\n" \ + "fmax v17.4s, v17.4s, v0.4s\n" \ + "fmax v18.4s, v18.4s, v0.4s\n" \ + "fmax v19.4s, v19.4s, v0.4s\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W32_SEMI2_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v16.4s, v16.4s, v0.4s\n" \ + "fmax v17.4s, v17.4s, v0.4s\n" \ + "fmax v18.4s, v18.4s, v0.4s\n" \ + "fmax v19.4s, v19.4s, v0.4s\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W16_SEMI2_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W8_SEMI2_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W4_SEMI2_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W48_SEMI1_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax 
v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W32_SEMI1_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W16_SEMI1_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W8_SEMI1_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W4_SEMI1_V8_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" \ + "beq 9f \n" \ + "cmp %w[vflag_act], #1\n" \ + "bne 10f \n" \ + "movi v0.4s, #0\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W48_SEMI2_V8_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v8.4s, v8.4s, v0.4s\n" \ + "fmax v9.4s, v9.4s, v0.4s\n" \ + "fmax v10.4s, v10.4s, v0.4s\n" \ + "fmax v11.4s, v11.4s, v0.4s\n" \ + "fmax v12.4s, v12.4s, v0.4s\n" \ + "fmax v13.4s, v13.4s, v0.4s\n" \ + "fmax v14.4s, v14.4s, v0.4s\n" \ + "fmax v15.4s, v15.4s, v0.4s\n" \ + "fmax v16.4s, v16.4s, v0.4s\n" \ + "fmax v17.4s, v17.4s, v0.4s\n" \ + "fmax v18.4s, v18.4s, v0.4s\n" \ + "fmax v19.4s, v19.4s, v0.4s\n" \ + "fmin v8.4s, v8.4s, v1.4s\n" \ + "fmin v9.4s, v9.4s, v1.4s\n" \ + "fmin v10.4s, v10.4s, v1.4s\n" \ + "fmin v11.4s, v11.4s, v1.4s\n" \ + "fmin v12.4s, v12.4s, v1.4s\n" \ + "fmin v13.4s, v13.4s, v1.4s\n" \ + "fmin v14.4s, v14.4s, v1.4s\n" \ + "fmin v15.4s, v15.4s, v1.4s\n" \ + "fmin v16.4s, v16.4s, v1.4s\n" \ + "fmin v17.4s, v17.4s, v1.4s\n" \ + "fmin v18.4s, v18.4s, v1.4s\n" \ + "fmin v19.4s, v19.4s, v1.4s\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "fmin v20.4s, v20.4s, v1.4s\n" \ + "fmin v21.4s, v21.4s, v1.4s\n" \ + "fmin v22.4s, v22.4s, v1.4s\n" \ + "fmin v23.4s, v23.4s, v1.4s\n" \ + "fmin v24.4s, v24.4s, v1.4s\n" \ + "fmin v25.4s, v25.4s, v1.4s\n" \ + "fmin v26.4s, v26.4s, v1.4s\n" \ + "fmin v27.4s, v27.4s, v1.4s\n" \ + "fmin v28.4s, v28.4s, v1.4s\n" \ + "fmin v29.4s, v29.4s, v1.4s\n" \ + "fmin v30.4s, v30.4s, v1.4s\n" \ + "fmin v31.4s, v31.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W32_SEMI2_V8_RELU6 
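/* relu6: fmax against 0 then fmin against valpha, which carries the clip value */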
\ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v16.4s, v16.4s, v0.4s\n" \ + "fmax v17.4s, v17.4s, v0.4s\n" \ + "fmax v18.4s, v18.4s, v0.4s\n" \ + "fmax v19.4s, v19.4s, v0.4s\n" \ + "fmin v16.4s, v16.4s, v1.4s\n" \ + "fmin v17.4s, v17.4s, v1.4s\n" \ + "fmin v18.4s, v18.4s, v1.4s\n" \ + "fmin v19.4s, v19.4s, v1.4s\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "fmin v20.4s, v20.4s, v1.4s\n" \ + "fmin v21.4s, v21.4s, v1.4s\n" \ + "fmin v22.4s, v22.4s, v1.4s\n" \ + "fmin v23.4s, v23.4s, v1.4s\n" \ + "fmin v24.4s, v24.4s, v1.4s\n" \ + "fmin v25.4s, v25.4s, v1.4s\n" \ + "fmin v26.4s, v26.4s, v1.4s\n" \ + "fmin v27.4s, v27.4s, v1.4s\n" \ + "fmin v28.4s, v28.4s, v1.4s\n" \ + "fmin v29.4s, v29.4s, v1.4s\n" \ + "fmin v30.4s, v30.4s, v1.4s\n" \ + "fmin v31.4s, v31.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W16_SEMI2_V8_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "fmin v24.4s, v24.4s, v1.4s\n" \ + "fmin v25.4s, v25.4s, v1.4s\n" \ + "fmin v26.4s, v26.4s, v1.4s\n" \ + "fmin v27.4s, v27.4s, v1.4s\n" \ + "fmin v28.4s, v28.4s, v1.4s\n" \ + "fmin v29.4s, v29.4s, v1.4s\n" \ + "fmin v30.4s, v30.4s, v1.4s\n" \ + "fmin v31.4s, v31.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W8_SEMI2_V8_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "fmin v28.4s, v28.4s, v1.4s\n" \ + "fmin v29.4s, v29.4s, v1.4s\n" \ + "fmin v30.4s, v30.4s, v1.4s\n" \ + "fmin v31.4s, v31.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W4_SEMI2_V8_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "fmin v30.4s, v30.4s, v1.4s\n" \ + "fmin v31.4s, v31.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W48_SEMI1_V8_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "fmax v28.4s, v28.4s, v0.4s\n" \ + "fmax v29.4s, v29.4s, v0.4s\n" \ + "fmax v30.4s, v30.4s, v0.4s\n" \ + "fmax v31.4s, v31.4s, v0.4s\n" \ + "fmin v20.4s, v20.4s, v1.4s\n" \ + "fmin v21.4s, v21.4s, v1.4s\n" \ + "fmin v22.4s, v22.4s, v1.4s\n" \ + "fmin v23.4s, v23.4s, 
v1.4s\n" \ + "fmin v24.4s, v24.4s, v1.4s\n" \ + "fmin v25.4s, v25.4s, v1.4s\n" \ + "fmin v26.4s, v26.4s, v1.4s\n" \ + "fmin v27.4s, v27.4s, v1.4s\n" \ + "fmin v28.4s, v28.4s, v1.4s\n" \ + "fmin v29.4s, v29.4s, v1.4s\n" \ + "fmin v30.4s, v30.4s, v1.4s\n" \ + "fmin v31.4s, v31.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W32_SEMI1_V8_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "fmax v24.4s, v24.4s, v0.4s\n" \ + "fmax v25.4s, v25.4s, v0.4s\n" \ + "fmax v26.4s, v26.4s, v0.4s\n" \ + "fmax v27.4s, v27.4s, v0.4s\n" \ + "fmin v20.4s, v20.4s, v1.4s\n" \ + "fmin v21.4s, v21.4s, v1.4s\n" \ + "fmin v22.4s, v22.4s, v1.4s\n" \ + "fmin v23.4s, v23.4s, v1.4s\n" \ + "fmin v24.4s, v24.4s, v1.4s\n" \ + "fmin v25.4s, v25.4s, v1.4s\n" \ + "fmin v26.4s, v26.4s, v1.4s\n" \ + "fmin v27.4s, v27.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W16_SEMI1_V8_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "fmax v23.4s, v23.4s, v0.4s\n" \ + "fmin v20.4s, v20.4s, v1.4s\n" \ + "fmin v21.4s, v21.4s, v1.4s\n" \ + "fmin v22.4s, v22.4s, v1.4s\n" \ + "fmin v23.4s, v23.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W8_SEMI1_V8_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmin v20.4s, v20.4s, v1.4s\n" \ + "fmin v21.4s, v21.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W4_SEMI1_V8_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" \ + "bne 11f \n" \ + "movi v0.4s, #0\n" \ + "dup v1.4s, %w[valpha]\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmin v20.4s, v20.4s, v1.4s\n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W48_SEMI2_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v12.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v12.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v13.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v13.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v11.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v12.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v13.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v14.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v14.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v15.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v16.4s, v1.4s \n" /* vmulq_f32 
*/ \ + "bif v14.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v15.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v16.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v18.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v17.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v18.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v19.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v24.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v25.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v26.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v27.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v28.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v29.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v30.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v31.16b, v7.16b, v6.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W48_SEMI2_V8_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v8.4s, v1.4s \n" \ + "fadd v6.4s, v9.4s, v1.4s \n" \ + "fmul v5.4s, v8.4s, v2.4s \n" \ + "fmul v7.4s, v9.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v8.4s, v5.4s, v4.4s \n" \ + "fmul v9.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v10.4s, v1.4s \n" \ + "fadd v6.4s, v11.4s, v1.4s \n" \ + "fmul v5.4s, v10.4s, v2.4s \n" \ + "fmul v7.4s, v11.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v10.4s, v5.4s, v4.4s \n" \ + "fmul v11.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v12.4s, v1.4s \n" \ + "fadd v6.4s, v13.4s, v1.4s \n" \ + "fmul v5.4s, v12.4s, v2.4s \n" \ + "fmul v7.4s, v13.4s, v2.4s \n" \ + 
"fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v12.4s, v5.4s, v4.4s \n" \ + "fmul v13.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v14.4s, v1.4s \n" \ + "fadd v6.4s, v15.4s, v1.4s \n" \ + "fmul v5.4s, v14.4s, v2.4s \n" \ + "fmul v7.4s, v15.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v14.4s, v5.4s, v4.4s \n" \ + "fmul v15.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v16.4s, v1.4s \n" \ + "fadd v6.4s, v17.4s, v1.4s \n" \ + "fmul v5.4s, v16.4s, v2.4s \n" \ + "fmul v7.4s, v17.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v16.4s, v5.4s, v4.4s \n" \ + "fmul v17.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v18.4s, v1.4s \n" \ + "fadd v6.4s, v19.4s, v1.4s \n" \ + "fmul v5.4s, v18.4s, v2.4s \n" \ + "fmul v7.4s, v19.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v18.4s, v5.4s, v4.4s \n" \ + "fmul v19.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v20.4s, v1.4s \n" \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fmul v5.4s, v20.4s, v2.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v20.4s, v5.4s, v4.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v22.4s, v1.4s \n" \ + "fadd v6.4s, v23.4s, v1.4s \n" \ + "fmul v5.4s, v22.4s, v2.4s \n" \ + "fmul v7.4s, v23.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v22.4s, v5.4s, v4.4s \n" \ + "fmul v23.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v24.4s, v1.4s \n" \ + "fadd v6.4s, v25.4s, v1.4s \n" \ + "fmul v5.4s, v24.4s, v2.4s \n" \ + "fmul v7.4s, v25.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v24.4s, v5.4s, v4.4s \n" \ + "fmul v25.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v26.4s, v1.4s \n" \ + "fadd v6.4s, v27.4s, v1.4s \n" \ + "fmul v5.4s, v26.4s, v2.4s \n" \ + "fmul v7.4s, v27.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v26.4s, v5.4s, v4.4s \n" \ + "fmul v27.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v28.4s, v1.4s \n" \ + "fadd v6.4s, v29.4s, v1.4s \n" \ + "fmul v5.4s, v28.4s, v2.4s \n" \ + "fmul v7.4s, v29.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v28.4s, v5.4s, v4.4s \n" \ + "fmul v29.4s, v7.4s, v6.4s \n" \ + "fadd v4.4s, v30.4s, v1.4s \n" \ + "fadd v6.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v30.4s, v2.4s \n" \ + "fmul v7.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v30.4s, v5.4s, v4.4s \n" \ + "fmul v31.4s, v7.4s, v6.4s \n" \ + "9:\n" + +#define SPARSE_F32_F32_W32_SEMI2_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + 
"fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v16.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v18.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v16.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v17.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v18.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v19.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v20.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v21.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v22.16b, v15.16b, v14.16b \n" /* choose*/ \ + "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v24.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v25.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v26.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v27.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v28.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v29.16b, v15.16b, v14.16b \n" /* choose*/ \ + "fcmge v2.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v30.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v31.16b, v5.16b, v4.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W32_SEMI2_V8_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v16.4s, v1.4s \n" \ + "fadd v6.4s, v17.4s, v1.4s \n" \ + "fadd v8.4s, v18.4s, v1.4s \n" \ + "fadd v10.4s, v19.4s, v1.4s \n" \ + "fadd v12.4s, v20.4s, v1.4s \n" \ + "fadd v14.4s, v21.4s, v1.4s \n" \ + "fmul v5.4s, v16.4s, v2.4s \n" \ + "fmul v7.4s, v17.4s, v2.4s \n" \ + "fmul v9.4s, v18.4s, v2.4s \n" \ + "fmul v11.4s, v19.4s, v2.4s \n" \ + "fmul v13.4s, v20.4s, v2.4s \n" \ + "fmul v15.4s, v21.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" 
\ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmul v16.4s, v5.4s, v4.4s \n" \ + "fmul v17.4s, v7.4s, v6.4s \n" \ + "fmul v18.4s, v9.4s, v8.4s \n" \ + "fmul v19.4s, v11.4s, v10.4s \n" \ + "fmul v20.4s, v13.4s, v12.4s \n" \ + "fmul v21.4s, v15.4s, v14.4s \n" \ + "fadd v4.4s, v22.4s, v1.4s \n" \ + "fadd v6.4s, v23.4s, v1.4s \n" \ + "fadd v8.4s, v24.4s, v1.4s \n" \ + "fadd v10.4s, v25.4s, v1.4s \n" \ + "fadd v12.4s, v26.4s, v1.4s \n" \ + "fadd v14.4s, v27.4s, v1.4s \n" \ + "fmul v5.4s, v22.4s, v2.4s \n" \ + "fmul v7.4s, v23.4s, v2.4s \n" \ + "fmul v9.4s, v24.4s, v2.4s \n" \ + "fmul v11.4s, v25.4s, v2.4s \n" \ + "fmul v13.4s, v26.4s, v2.4s \n" \ + "fmul v15.4s, v27.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmul v22.4s, v5.4s, v4.4s \n" \ + "fmul v23.4s, v7.4s, v6.4s \n" \ + "fmul v24.4s, v9.4s, v8.4s \n" \ + "fmul v25.4s, v11.4s, v10.4s \n" \ + "fmul v26.4s, v13.4s, v12.4s \n" \ + "fmul v27.4s, v15.4s, v14.4s \n" \ + "fadd v4.4s, v28.4s, v1.4s \n" \ + "fadd v6.4s, v29.4s, v1.4s \n" \ + "fadd v8.4s, v30.4s, v1.4s \n" \ + "fadd v10.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v28.4s, v2.4s \n" \ + "fmul v7.4s, v29.4s, v2.4s \n" \ + "fmul v9.4s, v30.4s, v2.4s \n" \ + "fmul v11.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmul v28.4s, v5.4s, v4.4s \n" \ + "fmul v29.4s, v7.4s, v6.4s \n" \ + "fmul v30.4s, v9.4s, v8.4s \n" \ + "fmul v31.4s, v11.4s, v10.4s \n" \ + "9:\n" + +#define SPARSE_F32_F32_W16_SEMI2_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v24.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v25.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v26.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v27.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v28.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v29.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v30.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif v31.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W16_SEMI2_V8_HARD_SWISH \ + /* do 
hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v24.4s, v1.4s \n" \ + "fadd v6.4s, v25.4s, v1.4s \n" \ + "fadd v8.4s, v26.4s, v1.4s \n" \ + "fadd v10.4s, v27.4s, v1.4s \n" \ + "fadd v12.4s, v28.4s, v1.4s \n" \ + "fadd v14.4s, v29.4s, v1.4s \n" \ + "fadd v16.4s, v30.4s, v1.4s \n" \ + "fadd v18.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v24.4s, v2.4s \n" \ + "fmul v7.4s, v25.4s, v2.4s \n" \ + "fmul v9.4s, v26.4s, v2.4s \n" \ + "fmul v11.4s, v27.4s, v2.4s \n" \ + "fmul v13.4s, v28.4s, v2.4s \n" \ + "fmul v15.4s, v29.4s, v2.4s \n" \ + "fmul v17.4s, v30.4s, v2.4s \n" \ + "fmul v19.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmax v16.4s, v16.4s, v0.4s \n" \ + "fmax v18.4s, v18.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmin v16.4s, v16.4s, v3.4s \n" \ + "fmin v18.4s, v18.4s, v3.4s \n" \ + "fmul v24.4s, v5.4s, v4.4s \n" \ + "fmul v25.4s, v7.4s, v6.4s \n" \ + "fmul v26.4s, v9.4s, v8.4s \n" \ + "fmul v27.4s, v11.4s, v10.4s \n" \ + "fmul v28.4s, v13.4s, v12.4s \n" \ + "fmul v29.4s, v15.4s, v14.4s \n" \ + "fmul v30.4s, v17.4s, v16.4s \n" \ + "fmul v31.4s, v19.4s, v18.4s \n" \ + "9:\n" + +#define SPARSE_F32_F32_W8_SEMI2_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v10.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v28.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v29.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v30.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif v31.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W8_SEMI2_V8_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v12.4s, v28.4s, v1.4s \n" \ + "fadd v14.4s, v29.4s, v1.4s \n" \ + "fadd v16.4s, v30.4s, v1.4s \n" \ + "fadd v18.4s, v31.4s, v1.4s \n" \ + "fmul v13.4s, v28.4s, v2.4s \n" \ + "fmul v15.4s, v29.4s, v2.4s \n" \ + "fmul v17.4s, v30.4s, v2.4s \n" \ + "fmul v19.4s, v31.4s, v2.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmax v16.4s, v16.4s, v0.4s \n" \ + "fmax v18.4s, v18.4s, v0.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmin v16.4s, v16.4s, v3.4s \n" \ + "fmin v18.4s, v18.4s, v3.4s \n" \ + "fmul v28.4s, v13.4s, v12.4s \n" \ + "fmul v29.4s, v15.4s, v14.4s \n" \ + "fmul v30.4s, v17.4s, v16.4s \n" \ + "fmul v31.4s, v19.4s, v18.4s \n" \ + "9:\n" + +#define 
SPARSE_F32_F32_W4_SEMI2_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v14.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v30.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif v31.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W4_SEMI2_V8_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v16.4s, v30.4s, v1.4s \n" \ + "fadd v18.4s, v31.4s, v1.4s \n" \ + "fmul v17.4s, v30.4s, v2.4s \n" \ + "fmul v19.4s, v31.4s, v2.4s \n" \ + "fmax v16.4s, v16.4s, v0.4s \n" \ + "fmax v18.4s, v18.4s, v0.4s \n" \ + "fmin v16.4s, v16.4s, v3.4s \n" \ + "fmin v18.4s, v18.4s, v3.4s \n" \ + "fmul v30.4s, v17.4s, v16.4s \n" \ + "fmul v31.4s, v19.4s, v18.4s \n" \ + "9:\n" + +#define SPARSE_F32_F32_W48_SEMI1_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v18.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v19.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v23.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v24.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v25.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v26.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif v27.16b, v17.16b, v16.16b \n" /* choose*/ \ + "bif v28.16b, v19.16b, v18.16b \n" /* choose*/ \ + "fcmge v8.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v29.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v30.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v31.16b, v13.16b, v12.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W48_SEMI1_V8_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ 
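/* note: q2 holds 1/hard_swish_scale as packed by INIT_SEMI_CONV_PARAM */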
\ + "fadd v4.4s, v20.4s, v1.4s \n" \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fadd v8.4s, v22.4s, v1.4s \n" \ + "fadd v10.4s, v23.4s, v1.4s \n" \ + "fadd v12.4s, v24.4s, v1.4s \n" \ + "fadd v14.4s, v25.4s, v1.4s \n" \ + "fadd v16.4s, v26.4s, v1.4s \n" \ + "fadd v18.4s, v27.4s, v1.4s \n" \ + "fmul v5.4s, v20.4s, v2.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmul v9.4s, v22.4s, v2.4s \n" \ + "fmul v11.4s, v23.4s, v2.4s \n" \ + "fmul v13.4s, v24.4s, v2.4s \n" \ + "fmul v15.4s, v25.4s, v2.4s \n" \ + "fmul v17.4s, v26.4s, v2.4s \n" \ + "fmul v19.4s, v27.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmax v16.4s, v16.4s, v0.4s \n" \ + "fmax v18.4s, v18.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmin v16.4s, v16.4s, v3.4s \n" \ + "fmin v18.4s, v18.4s, v3.4s \n" \ + "fmul v20.4s, v5.4s, v4.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fmul v22.4s, v9.4s, v8.4s \n" \ + "fmul v23.4s, v11.4s, v10.4s \n" \ + "fmul v24.4s, v13.4s, v12.4s \n" \ + "fmul v25.4s, v15.4s, v14.4s \n" \ + "fmul v26.4s, v17.4s, v16.4s \n" \ + "fmul v27.4s, v19.4s, v18.4s \n" \ + "fadd v4.4s, v28.4s, v1.4s \n" \ + "fadd v6.4s, v29.4s, v1.4s \n" \ + "fadd v8.4s, v30.4s, v1.4s \n" \ + "fadd v10.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v28.4s, v2.4s \n" \ + "fmul v7.4s, v29.4s, v2.4s \n" \ + "fmul v9.4s, v30.4s, v2.4s \n" \ + "fmul v11.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmul v28.4s, v5.4s, v4.4s \n" \ + "fmul v29.4s, v7.4s, v6.4s \n" \ + "fmul v30.4s, v9.4s, v8.4s \n" \ + "fmul v31.4s, v11.4s, v10.4s \n" \ + "9:\n" + +#define SPARSE_F32_F32_W32_SEMI1_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v23.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v24.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v25.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v26.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif 
v27.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W32_SEMI1_V8_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v20.4s, v1.4s \n" \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fadd v8.4s, v22.4s, v1.4s \n" \ + "fadd v10.4s, v23.4s, v1.4s \n" \ + "fadd v12.4s, v24.4s, v1.4s \n" \ + "fadd v14.4s, v25.4s, v1.4s \n" \ + "fadd v16.4s, v26.4s, v1.4s \n" \ + "fadd v18.4s, v27.4s, v1.4s \n" \ + "fmul v5.4s, v20.4s, v2.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmul v9.4s, v22.4s, v2.4s \n" \ + "fmul v11.4s, v23.4s, v2.4s \n" \ + "fmul v13.4s, v24.4s, v2.4s \n" \ + "fmul v15.4s, v25.4s, v2.4s \n" \ + "fmul v17.4s, v26.4s, v2.4s \n" \ + "fmul v19.4s, v27.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmax v16.4s, v16.4s, v0.4s \n" \ + "fmax v18.4s, v18.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmin v16.4s, v16.4s, v3.4s \n" \ + "fmin v18.4s, v18.4s, v3.4s \n" \ + "fmul v20.4s, v5.4s, v4.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fmul v22.4s, v9.4s, v8.4s \n" \ + "fmul v23.4s, v11.4s, v10.4s \n" \ + "fmul v24.4s, v13.4s, v12.4s \n" \ + "fmul v25.4s, v15.4s, v14.4s \n" \ + "fmul v26.4s, v17.4s, v16.4s \n" \ + "fmul v27.4s, v19.4s, v18.4s \n" \ + "9:\n" + +#define SPARSE_F32_F32_W16_SEMI1_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v23.16b, v9.16b, v8.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W16_SEMI1_V8_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v20.4s, v1.4s \n" \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fadd v8.4s, v22.4s, v1.4s \n" \ + "fadd v10.4s, v23.4s, v1.4s \n" \ + "fmul v5.4s, v20.4s, v2.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmul v9.4s, v22.4s, v2.4s \n" \ + "fmul v11.4s, v23.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmul v20.4s, v5.4s, v4.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fmul v22.4s, v9.4s, v8.4s 
\n" \ + "fmul v23.4s, v11.4s, v10.4s \n" \ + "9:\n" + +#define SPARSE_F32_F32_W8_SEMI1_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W8_SEMI1_V8_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v20.4s, v1.4s \n" \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fmul v5.4s, v20.4s, v2.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v20.4s, v5.4s, v4.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "9:\n" + +#define SPARSE_F32_F32_W4_SEMI1_V8_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" \ + "bne 12f \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_F32_F32_W4_SEMI1_V8_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v20.4s, v1.4s \n" \ + "fmul v5.4s, v20.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmul v20.4s, v5.4s, v4.4s \n" \ + "9:\n" + +/** + * The data block size for sparse matrix calculation is Mx48, that is, the + * parameter + * matrix size is MxK, the activation matrix is Kx48, and the required data is + * MxKxKx48. 
+ */ +#define SPARSE_F32_F32_W48_SEMI2_V8_OUT \ + SPARSE_F32_F32_W48_SEMI2_V8_KERNEL \ + SPARSE_F32_F32_W48_SEMI2_V8_RELU \ + SPARSE_F32_F32_W48_SEMI2_V8_RELU6 \ + SPARSE_F32_F32_W48_SEMI2_V8_LEAKY_RELU \ + SPARSE_F32_F32_W48_SEMI2_V8_HARD_SWISH \ + /* store result */ \ + "stp q8, q9, [%[c_ptr1]]\n" \ + "stp q10, q11, [%[c_ptr1], #32]\n" \ + "stp q12, q13, [%[c_ptr1], #64]\n" \ + "stp q14, q15, [%[c_ptr1], #96]\n" \ + "stp q16, q17, [%[c_ptr1], #128]\n" \ + "stp q18, q19, [%[c_ptr1], #160]\n" \ + "stp q20, q21, [%[c_ptr2]]\n" \ + "stp q22, q23, [%[c_ptr2], #32]\n" \ + "stp q24, q25, [%[c_ptr2], #64]\n" \ + "stp q26, q27, [%[c_ptr2], #96]\n" \ + "stp q28, q29, [%[c_ptr2], #128]\n" \ + "stp q30, q31, [%[c_ptr2], #160]\n" + +#define SPARSE_F32_F32_W48_SEMI1_V8_OUT \ + SPARSE_F32_F32_W48_SEMI1_V8_KERNEL \ + SPARSE_F32_F32_W48_SEMI1_V8_RELU \ + SPARSE_F32_F32_W48_SEMI1_V8_RELU6 \ + SPARSE_F32_F32_W48_SEMI1_V8_LEAKY_RELU \ + SPARSE_F32_F32_W48_SEMI1_V8_HARD_SWISH \ + /* store result */ \ + "stp q20, q21, [%[c_ptr]]\n" \ + "stp q22, q23, [%[c_ptr], #32]\n" \ + "stp q24, q25, [%[c_ptr], #64]\n" \ + "stp q26, q27, [%[c_ptr], #96]\n" \ + "stp q28, q29, [%[c_ptr], #128]\n" \ + "stp q30, q31, [%[c_ptr], #160]\n" + +/** + * The data block size for sparse matrix calculation is Mx32, that is, the + * parameter + * matrix size is MxK, the activation matrix is Kx48, and the required data is + * MxKxKx32. + */ +#define SPARSE_F32_F32_W32_SEMI2_V8_OUT \ + SPARSE_F32_F32_W32_SEMI2_V8_KERNEL \ + SPARSE_F32_F32_W32_SEMI2_V8_RELU \ + SPARSE_F32_F32_W32_SEMI2_V8_RELU6 \ + SPARSE_F32_F32_W32_SEMI2_V8_LEAKY_RELU \ + SPARSE_F32_F32_W32_SEMI2_V8_HARD_SWISH \ + /* store result */ \ + "stp q16, q17, [%[c_ptr1]]\n" \ + "stp q18, q19, [%[c_ptr1], #32]\n" \ + "stp q20, q21, [%[c_ptr1], #64]\n" \ + "stp q22, q23, [%[c_ptr1], #96]\n" \ + "stp q24, q25, [%[c_ptr2]]\n" \ + "stp q26, q27, [%[c_ptr2], #32]\n" \ + "stp q28, q29, [%[c_ptr2], #64]\n" \ + "stp q30, q31, [%[c_ptr2], #96]\n" + +#define SPARSE_F32_F32_W32_SEMI1_V8_OUT \ + SPARSE_F32_F32_W32_SEMI1_V8_KERNEL \ + SPARSE_F32_F32_W32_SEMI1_V8_RELU \ + SPARSE_F32_F32_W32_SEMI1_V8_RELU6 \ + SPARSE_F32_F32_W32_SEMI1_V8_LEAKY_RELU \ + SPARSE_F32_F32_W32_SEMI1_V8_HARD_SWISH \ + /* store result */ \ + "stp q20, q21, [%[c_ptr]]\n" \ + "stp q22, q23, [%[c_ptr], #32]\n" \ + "stp q24, q25, [%[c_ptr], #64]\n" \ + "stp q26, q27, [%[c_ptr], #96]\n" + +/** + * The data block size for sparse matrix calculation is Mx16, that is, the + * parameter + * matrix size is MxK, the activation matrix is Kx16, and the required data is + * MxKxKx16. + */ +#define SPARSE_F32_F32_W16_SEMI2_V8_OUT \ + SPARSE_F32_F32_W16_SEMI2_V8_KERNEL \ + SPARSE_F32_F32_W16_SEMI2_V8_RELU \ + SPARSE_F32_F32_W16_SEMI2_V8_RELU6 \ + SPARSE_F32_F32_W16_SEMI2_V8_LEAKY_RELU \ + SPARSE_F32_F32_W16_SEMI2_V8_HARD_SWISH \ + /* store result */ \ + "stp q24, q25, [%[c_ptr1]]\n" \ + "stp q26, q27, [%[c_ptr1], #32]\n" \ + "stp q28, q29, [%[c_ptr2]]\n" \ + "stp q30, q31, [%[c_ptr2], #32]\n" + +#define SPARSE_F32_F32_W16_SEMI1_V8_OUT \ + SPARSE_F32_F32_W16_SEMI1_V8_KERNEL \ + SPARSE_F32_F32_W16_SEMI1_V8_RELU \ + SPARSE_F32_F32_W16_SEMI1_V8_RELU6 \ + SPARSE_F32_F32_W16_SEMI1_V8_LEAKY_RELU \ + SPARSE_F32_F32_W16_SEMI1_V8_HARD_SWISH \ + /* store result */ \ + "stp q20, q21, [%[c_ptr]]\n" \ + "stp q22, q23, [%[c_ptr], #32]\n" + +/** + * The data block size for sparse matrix calculation is Mx8, that is, the + * parameter + * matrix size is MxK, the activation matrix is Kx8, and the required data is + * MxKxKx8. 
+/**
+ * The data block size for sparse matrix calculation is Mx8, that is, the
+ * parameter matrix is MxK, the activation matrix is Kx8, and the
+ * computation is (MxK) x (Kx8).
+ */
+#define SPARSE_F32_F32_W8_SEMI2_V8_OUT     \
+  SPARSE_F32_F32_W8_SEMI2_V8_KERNEL        \
+  SPARSE_F32_F32_W8_SEMI2_V8_RELU          \
+  SPARSE_F32_F32_W8_SEMI2_V8_RELU6         \
+  SPARSE_F32_F32_W8_SEMI2_V8_LEAKY_RELU    \
+  SPARSE_F32_F32_W8_SEMI2_V8_HARD_SWISH    \
+  /* store result */                       \
+  "stp q28, q29, [%[c_ptr1]]\n"            \
+  "stp q30, q31, [%[c_ptr2]]\n"
+
+#define SPARSE_F32_F32_W8_SEMI1_V8_OUT     \
+  SPARSE_F32_F32_W8_SEMI1_V8_KERNEL        \
+  SPARSE_F32_F32_W8_SEMI1_V8_RELU          \
+  SPARSE_F32_F32_W8_SEMI1_V8_RELU6         \
+  SPARSE_F32_F32_W8_SEMI1_V8_LEAKY_RELU    \
+  SPARSE_F32_F32_W8_SEMI1_V8_HARD_SWISH    \
+  /* store result */                       \
+  "stp q20, q21, [%[c_ptr]]\n"
+
+/**
+ * The data block size for sparse matrix calculation is Mx4, that is, the
+ * parameter matrix is MxK, the activation matrix is Kx4, and the
+ * computation is (MxK) x (Kx4).
+ */
+#define SPARSE_F32_F32_W4_SEMI2_V8_OUT     \
+  SPARSE_F32_F32_W4_SEMI2_V8_KERNEL        \
+  SPARSE_F32_F32_W4_SEMI2_V8_RELU          \
+  SPARSE_F32_F32_W4_SEMI2_V8_RELU6         \
+  SPARSE_F32_F32_W4_SEMI2_V8_LEAKY_RELU    \
+  SPARSE_F32_F32_W4_SEMI2_V8_HARD_SWISH    \
+  /* store result */                       \
+  "str q30, [%[c_ptr1]]\n"                 \
+  "str q31, [%[c_ptr2]]\n"
+
+#define SPARSE_F32_F32_W4_SEMI1_V8_OUT     \
+  SPARSE_F32_F32_W4_SEMI1_V8_KERNEL        \
+  SPARSE_F32_F32_W4_SEMI1_V8_RELU          \
+  SPARSE_F32_F32_W4_SEMI1_V8_RELU6         \
+  SPARSE_F32_F32_W4_SEMI1_V8_LEAKY_RELU    \
+  SPARSE_F32_F32_W4_SEMI1_V8_HARD_SWISH    \
+  /* store result */                       \
+  "str q20, [%[c_ptr]]\n"
+
+#define INIT_SEMI_CONV_PARAM(Dtype_i, Dtype_o, start_w)                     \
+  auto act_param = param.activation_param;                                  \
+  auto act_type = act_param.active_type;                                    \
+  volatile float alpha = 0.f;                                               \
+  int flag_act = 0x00; /* relu: 1, relu6: 2, leaky: 3 */                    \
+  float hs_param[12] = {0.f};                                               \
+  if (act_param.has_active) {                                               \
+    if (act_type == lite_api::ActivationType::kRelu) {                      \
+      flag_act = 0x01;                                                      \
+    } else if (act_type == lite_api::ActivationType::kRelu6) {              \
+      flag_act = 0x02;                                                      \
+      alpha = act_param.Relu_clipped_coef;                                  \
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {          \
+      flag_act = 0x03;                                                      \
+      alpha = act_param.Leaky_relu_alpha;                                   \
+    } else if (act_type == lite_api::ActivationType::kHardSwish) {          \
+      flag_act = 0x04;                                                      \
+      for (int i = 0; i < 4; i++) {                                         \
+        hs_param[i] = act_param.hard_swish_offset;                          \
+        hs_param[i + 4] = 1.0 / act_param.hard_swish_scale;                 \
+        hs_param[i + 8] = act_param.hard_swish_threshold;                   \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  int flag_bias = (bias != nullptr) ? 1 : 0;                                \
+  size_t mc = N * sizeof(Dtype_i);                                          \
+  size_t nc = M;                                                            \
+  size_t output_stride = N * sizeof(Dtype_o);                               \
+  size_t output_decrement = output_stride * nc - start_w * sizeof(Dtype_o); \
+  size_t pair_num = M / 2;                                                  \
+  size_t lave_num = M % 2;
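+// Scalar reference for the activation selected by flag_act above (an
+// illustrative sketch, not called by the kernels; the function name is
+// hypothetical). hs_param packs {offset x4, 1/scale x4, threshold x4} so
+// the asm side can load one q-register per hard_swish constant.
+static inline float semi_conv_act_ref(
+    float x, int flag_act, float alpha, const float* hs_param) {
+  if (flag_act == 1) {  // relu
+    return x > 0.f ? x : 0.f;
+  } else if (flag_act == 2) {  // relu6: alpha carries the clip value
+    float y = x > 0.f ? x : 0.f;
+    return y < alpha ? y : alpha;
+  } else if (flag_act == 3) {  // leaky relu
+    return x >= 0.f ? x : x * alpha;
+  } else if (flag_act == 4) {  // hard_swish
+    float t = x + hs_param[0];              // x + offset
+    t = t > 0.f ? t : 0.f;                  // max(., 0)
+    t = t < hs_param[8] ? t : hs_param[8];  // min(., threshold)
+    return t * (x * hs_param[4]);           // * x / scale
+  }
+  return x;  // flag_act == 0: no activation
+}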
+#define GET_SEMI_PARAM_TABLE(Dtype_i, Dtype_o)                                \
+  Dtype_o* out_ptr1 = reinterpret_cast<Dtype_o*>((uintptr_t)output +          \
+                                                 output_stride * 2 * i);      \
+  Dtype_o* out_ptr2 = reinterpret_cast<Dtype_o*>(                             \
+      (uintptr_t)output + output_stride * (2 * i + 1));                       \
+  const Dtype_i* cur_w = A;                                                   \
+  uint32_t nnz = nidx_nnzmap[i];                                              \
+  const Dtype_i* cur_b = B;                                                   \
+  const int32_t* dmap = widx_dmap;                                            \
+  if (i != 0) {                                                               \
+    cur_w = A + nidx_nnzmap[i - 1] * 2;                                       \
+    nnz = nidx_nnzmap[i] - nidx_nnzmap[i - 1];                                \
+    cur_b += ((nidx_nnzmap[i - 1] == 0)                                       \
+                  ? 0                                                         \
+                  : (widx_dmap[nidx_nnzmap[i - 1] - 1] / sizeof(Dtype_i)));   \
+    dmap = widx_dmap + nidx_nnzmap[i - 1];                                    \
+  }                                                                           \
+  const float* pbias = (bias != nullptr) ? (bias + i * 2) : bias_zero;
+
+#define GET_UNSTRUCT_PARAM_TABLE(Dtype_i, Dtype_o)                            \
+  Dtype_o* out_ptr = reinterpret_cast<Dtype_o*>(                              \
+      (uintptr_t)output + output_stride * 2 * pair_num);                      \
+  const Dtype_i* cur_w = A;                                                   \
+  uint32_t nnz = nidx_nnzmap[pair_num];                                       \
+  const Dtype_i* cur_b = B;                                                   \
+  const int32_t* dmap = widx_dmap;                                            \
+  if (pair_num != 0) {                                                        \
+    cur_w = A + nidx_nnzmap[pair_num - 1] * 2;                                \
+    nnz = nidx_nnzmap[pair_num] - nidx_nnzmap[pair_num - 1];                  \
+    cur_b +=                                                                  \
+        ((nidx_nnzmap[pair_num - 1] == 0)                                     \
+             ? 0                                                              \
+             : (widx_dmap[nidx_nnzmap[pair_num - 1] - 1] / sizeof(Dtype_i))); \
+    dmap = widx_dmap + nidx_nnzmap[pair_num - 1];                             \
+  }                                                                           \
+  float vbias = (bias != nullptr) ? bias[pair_num * 2] : 0.0;
+
+#define SET_ASSM_INPUT_PARAM1_V8_F32   \
+  : [a_ptr] "+r"(cur_w),               \
+    [b_ptr] "+r"(cur_b),               \
+    [c_ptr1] "+r"(out_ptr1),           \
+    [c_ptr2] "+r"(out_ptr2),           \
+    [k] "+r"(nnz),                     \
+    [widx_dmap] "+r"(dmap),            \
+    [bias_ptr] "+r"(pbias)             \
+  : [vflag_act] "r"(flag_act),         \
+    [valpha] "r"(alpha),               \
+    [hs_param] "r"(hs_param)
+
+#define SET_ASSM_INPUT_PARAM2_V8_F32   \
+  : [a_ptr] "+r"(cur_w),               \
+    [b_ptr] "+r"(cur_b),               \
+    [c_ptr] "+r"(out_ptr),             \
+    [k] "+r"(nnz),                     \
+    [widx_dmap] "+r"(dmap)             \
+  : [vbias] "r"(vbias),                \
+    [vflag_act] "r"(flag_act),         \
+    [valpha] "r"(alpha),               \
+    [hs_param] "r"(hs_param)
+
+#define COMPUTE_ACT_NEON_TWO_V8_F32                                      \
+  if (flag_act == 1) {                                                   \
+    float32x2_t vzero = vdup_n_f32(0);                                   \
+    vacc01n0 = vmax_f32(vacc01n0, vzero);                                \
+    vacc01n1 = vmax_f32(vacc01n1, vzero);                                \
+  } else if (flag_act == 2) {                                            \
+    float32x2_t vzero = vdup_n_f32(0);                                   \
+    float32x2_t aph = vdup_n_f32(alpha);                                 \
+    vacc01n0 = vmax_f32(vacc01n0, vzero);                                \
+    vacc01n1 = vmax_f32(vacc01n1, vzero);                                \
+    vacc01n0 = vmin_f32(vacc01n0, aph);                                  \
+    vacc01n1 = vmin_f32(vacc01n1, aph);                                  \
+  } else if (flag_act == 0) {                                            \
+  } else if (flag_act == 3) {                                            \
+    float32x2_t vzero = vdup_n_f32(0);                                   \
+    float32x2_t aph = vdup_n_f32(alpha);                                 \
+    uint32x2_t vflag0123 = vcge_f32(vacc01n0, vzero);                    \
+    uint32x2_t vflag4567 = vcge_f32(vacc01n1, vzero);                    \
+    float32x2_t v0123 = vmul_f32(vacc01n0, aph);                         \
+    float32x2_t v4567 = vmul_f32(vacc01n1, aph);                         \
+    vacc01n0 = vbsl_f32(vflag0123, vacc01n0, v0123);                     \
+    vacc01n1 = vbsl_f32(vflag4567, vacc01n1, v4567);                     \
+  } else {                                                               \
+    float32x2_t vzero = vdup_n_f32(0);                                   \
+    float32x2_t offset = vdup_n_f32(act_param.hard_swish_offset);        \
+    float32x2_t hs_scale = vdup_n_f32(1.0 / act_param.hard_swish_scale); \
+    float32x2_t thre = vdup_n_f32(act_param.hard_swish_threshold);       \
+    float32x2_t vset0123_1 = vadd_f32(vacc01n0, offset);                 \
+    float32x2_t vscale0123_1 = vmul_f32(vacc01n0, hs_scale);             \
+    float32x2_t vset0123_2 = vadd_f32(vacc01n1, offset);                 \
+    float32x2_t vscale0123_2 = vmul_f32(vacc01n1, hs_scale);             \
+    vset0123_1 = vmax_f32(vset0123_1, vzero);                            \
+    vset0123_1 = vmin_f32(vset0123_1, thre);                             \
+    vset0123_2 = vmax_f32(vset0123_2, vzero);                            \
+    vset0123_2 = vmin_f32(vset0123_2, thre);                             \
+    vacc01n0 = vmul_f32(vscale0123_1, vset0123_1);                       \
+    vacc01n1 = vmul_f32(vscale0123_2, vset0123_2);                       \
+  }
+
+#define COMPUTE_ACT_NEON_ONE_V8_F32                                      \
+  if (flag_act == 1) {                                                   \
+    float32x2_t vzero = vdup_n_f32(0);                                   \
+    vacc01n0 = vmax_f32(vacc01n0, vzero);                                \
+  } else if (flag_act == 2) {                                            \
+    float32x2_t vzero = vdup_n_f32(0);                                   \
+    float32x2_t aph = vdup_n_f32(alpha);                                 \
+    vacc01n0 = vmax_f32(vacc01n0, vzero);                                \
+    vacc01n0 = vmin_f32(vacc01n0, aph);                                  \
+  } else if (flag_act == 0) {                                            \
+  } else if (flag_act == 3) {                                            \
+    float32x2_t vzero = vdup_n_f32(0);                                   \
+    float32x2_t aph = vdup_n_f32(alpha);                                 \
+    uint32x2_t vflag0123 = vcge_f32(vacc01n0, vzero);                    \
+    float32x2_t v0123 = vmul_f32(vacc01n0, aph);                         \
+    vacc01n0 = vbsl_f32(vflag0123, vacc01n0, v0123);                     \
+  } else {                                                               \
+    float32x2_t vzero = vdup_n_f32(0);                                   \
+    float32x2_t offset = vdup_n_f32(act_param.hard_swish_offset);        \
+    float32x2_t hs_scale = vdup_n_f32(1.0 / act_param.hard_swish_scale); \
+    float32x2_t thre = vdup_n_f32(act_param.hard_swish_threshold);       \
+    float32x2_t vset0123 = vadd_f32(vacc01n0, offset);                   \
+    float32x2_t vscale0123 = vmul_f32(vacc01n0, hs_scale);               \
+    vset0123 = vmax_f32(vset0123, vzero);                                \
+    vset0123 = vmin_f32(vset0123, thre);                                 \
+    vacc01n0 = vmul_f32(vscale0123, vset0123);                           \
+  }
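+// The two COMPUTE_ACT_NEON_* helpers above mirror the asm activation chain
+// with 2-lane intrinsics; they serve the 2- and 1-column remainders of N
+// further below, where spinning up a full asm tile is not worthwhile.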
+/**
+ * \brief Semi-structured sparse implementation of 1x1 convolution;
+ * both input and output are f32.
+ * The semi-structured sparse matrix multiplication is computed in blocks:
+ * the block size is Mx48, that is, the parameter matrix is MxK and the
+ * activation matrix is Kx48; when N is less than 48, blocks of Mx32, Mx16,
+ * Mx8 and Mx4 are used in turn.
+ * @param A semi-structured sparse weight data
+ * @param B dense input data
+ * @param widx_dmap an array of int32_t values storing the scaled [by
+ * sizeof(input element)] difference between the input channels of
+ * successive non-zero elements
+ * @param nidx_nnzmap the number of non-zero kernel elements per output
+ * channel
+ * @param bias
+ * @param output
+ * @param M
+ * @param K
+ * @param N
+ * @param param
+ * @param ctx
+ */
+void sparse_semi_conv_fp32_pipelined(const float* A,
+                                     const float* B,
+                                     const int32_t* widx_dmap,
+                                     const uint32_t* nidx_nnzmap,
+                                     const float* bias,
+                                     float* output,
+                                     const int M,
+                                     const int K,
+                                     const int N,
+                                     const operators::SparseConvParam& param,
+                                     ARMContext* ctx) {
+  INIT_SEMI_CONV_PARAM(float, float, 48)
+  float bias_zero[2] = {0.f, 0.f};
+  while
+    SPARSE_LIKELY(mc >= 48 * sizeof(float)) {
+      LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+        GET_SEMI_PARAM_TABLE(float, float)
+        // clang-format off
+        asm volatile(SPARSE_F32_F32_W48_SEMI2_V8_OUT
+          SET_ASSM_INPUT_PARAM1_V8_F32
+          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+            "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+            "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+            "w1", "x1", "cc", "memory");
+        // clang-format on
+      }
+      LITE_PARALLEL_COMMON_END();
+      if
+        SPARSE_UNLIKELY(lave_num != 0) {
+          GET_UNSTRUCT_PARAM_TABLE(float, float)
+          // clang-format off
+          asm volatile(SPARSE_F32_F32_W48_SEMI1_V8_OUT
+            SET_ASSM_INPUT_PARAM2_V8_F32
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+              "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+              "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+              "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+              "w1", "x1", "cc", "memory");
+          // clang-format on
+        }
+      output = reinterpret_cast<float*>((uintptr_t)output + 48 * sizeof(float));
+      B += 48;
+      mc -= 48 * sizeof(float);
+    }
+
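+  // The remaining N % 48 output columns fall through to progressively
+  // narrower tiles: 32/16/8/4 columns in asm below, then 2- and 1-column
+  // remainders via NEON intrinsics.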
+  if
+    SPARSE_UNLIKELY(mc != 0) {
+      output_decrement += 16 * sizeof(float);
+      if (mc & (32 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          // clang-format off
+          asm volatile(SPARSE_F32_F32_W32_SEMI2_V8_OUT
+            SET_ASSM_INPUT_PARAM1_V8_F32
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+              "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+              "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+              "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+              "w1", "x1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            // clang-format off
+            asm volatile(SPARSE_F32_F32_W32_SEMI1_V8_OUT
+              SET_ASSM_INPUT_PARAM2_V8_F32
+              : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+                "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+                "v25", "v26", "v27", "w1", "x1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 32 * sizeof(float));
+        B += 32;
+        mc -= 32 * sizeof(float);
+      }
+      output_decrement += 16 * sizeof(float);
+      if (mc & (16 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          // clang-format off
+          asm volatile(SPARSE_F32_F32_W16_SEMI2_V8_OUT
+            SET_ASSM_INPUT_PARAM1_V8_F32
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+              "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+              "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27",
+              "v28", "v29", "v30", "v31", "w1", "x1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            // clang-format off
+            asm volatile(SPARSE_F32_F32_W16_SEMI1_V8_OUT
+              SET_ASSM_INPUT_PARAM2_V8_F32
+              : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                "v8", "v9", "v10", "v11", "v20", "v21", "v22", "v23",
+                "w1", "x1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 16 * sizeof(float));
+        B += 16;
+        mc -= 16 * sizeof(float);
+      }
+      output_decrement += 8 * sizeof(float);
+      if (mc & (8 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          // clang-format off
+          asm volatile(SPARSE_F32_F32_W8_SEMI2_V8_OUT
+            SET_ASSM_INPUT_PARAM1_V8_F32
+            : "v0", "v1", "v2", "v3", "v10", "v11", "v12", "v13", "v14",
+              "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31",
+              "w1", "x1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            // clang-format off
+            asm volatile(SPARSE_F32_F32_W8_SEMI1_V8_OUT
+              SET_ASSM_INPUT_PARAM2_V8_F32
+              : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21",
+                "w1", "x1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 8 * sizeof(float));
+        B += 8;
+        mc -= 8 * sizeof(float);
+      }
+      output_decrement += 4 * sizeof(float);
+      if (mc & (4 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          // clang-format off
+          asm volatile(SPARSE_F32_F32_W4_SEMI2_V8_OUT
+            SET_ASSM_INPUT_PARAM1_V8_F32
+            : "v0", "v1", "v2", "v3", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v30", "v31", "w1", "x1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            // clang-format off
+            asm volatile(SPARSE_F32_F32_W4_SEMI1_V8_OUT
+              SET_ASSM_INPUT_PARAM2_V8_F32
+              : "v0", "v1", "v2", "v3", "v4", "v5", "v20",
+                "w1", "x1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 4 * sizeof(float));
+        B += 4;
+        mc -= 4 * sizeof(float);
+      }
+
+      output_decrement += 2 * sizeof(float);
+      if (mc & (2 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          float32x2_t vacc01n0 = vld1_dup_f32(pbias);
+          float32x2_t vacc01n1 = vld1_dup_f32(pbias + 1);
+          if
+            SPARSE_LIKELY(nnz != 0) {
+              do {
+                const intptr_t diff = *dmap++;
+                const float32x2_t vi01 = vld1_f32(cur_b);
+                cur_b = (const float*)((uintptr_t)cur_b + (uintptr_t)diff);
+                const float32x2_t vw = vld1_f32(cur_w);
+                cur_w += 2;
+
+                vacc01n0 = vmla_lane_f32(vacc01n0, vi01, vw, 0);
+                vacc01n1 = vmla_lane_f32(vacc01n1, vi01, vw, 1);
+              } while (--nnz != 0);
+            }
+
+          COMPUTE_ACT_NEON_TWO_V8_F32
+          vst1_f32(out_ptr1, vacc01n0);
+          vst1_f32(out_ptr2, vacc01n1);
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            float32x2_t vacc01n0 = vdup_n_f32(vbias);
+            if
+              SPARSE_LIKELY(nnz != 0) {
+                do {
+                  const intptr_t diff = *dmap++;
+                  const float32x2_t vi01 = vld1_f32(cur_b);
+                  cur_b = (const float*)((uintptr_t)cur_b + (uintptr_t)diff);
+                  const float32x2_t vw = vld1_dup_f32(cur_w);
+                  cur_w += 1;
+                  vacc01n0 = vmla_lane_f32(vacc01n0, vi01, vw, 0);
+                } while (--nnz != 0);
+              }
+            COMPUTE_ACT_NEON_ONE_V8_F32
+            vst1_f32(out_ptr, vacc01n0);
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 2 * sizeof(float));
+        B += 2;
+        mc -= 2 * sizeof(float);
+      }
+
+      output_decrement += 1 * sizeof(float);
+      if (mc & (1 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          float32x2_t vacc01n0 = vld1_dup_f32(pbias);
+          float32x2_t vacc01n1 = vld1_dup_f32(pbias + 1);
+          if
+            SPARSE_LIKELY(nnz != 0) {
+              do {
+                const intptr_t diff = *dmap++;
+                const float32x2_t vi01 = vld1_dup_f32(cur_b);
+                cur_b = (const float*)((uintptr_t)cur_b + (uintptr_t)diff);
+                const float32x2_t vw = vld1_f32(cur_w);
+                cur_w += 2;
+
+                vacc01n0 = vmla_lane_f32(vacc01n0, vi01, vw, 0);
+                vacc01n1 = vmla_lane_f32(vacc01n1, vi01, vw, 1);
+              } while (--nnz != 0);
+            }
+
+          COMPUTE_ACT_NEON_TWO_V8_F32
+          vst1_lane_f32(out_ptr1, vacc01n0, 0);
+          vst1_lane_f32(out_ptr2, vacc01n1, 0);
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            float32x2_t vacc01n0 = vdup_n_f32(vbias);
+            if
+              SPARSE_LIKELY(nnz != 0) {
+                do {
+                  const intptr_t diff = *dmap++;
+                  const float32x2_t vi01 = vld1_dup_f32(cur_b);
+                  cur_b = (const float*)((uintptr_t)cur_b + (uintptr_t)diff);
+                  const float32x2_t vw = vld1_dup_f32(cur_w);
+                  cur_w += 1;
+                  vacc01n0 = vmla_lane_f32(vacc01n0, vi01, vw, 0);
+                } while (--nnz != 0);
+              }
+            COMPUTE_ACT_NEON_ONE_V8_F32
+            vst1_lane_f32(out_ptr, vacc01n0, 0);
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 1 * sizeof(float));
+        B += 1;
+        mc -= 1 * sizeof(float);
+      }
+    }
+}
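+// Scalar reference of the traversal the function above performs (an
+// illustrative sketch with a hypothetical name, not part of the build).
+// Each output-channel pair i owns nnz non-zeros; A stores two weights per
+// non-zero (one per row of the pair), and widx_dmap stores the byte
+// distance between the input rows of successive non-zeros, so the B
+// pointer hops instead of being re-indexed. The odd final row handled by
+// GET_UNSTRUCT_PARAM_TABLE is omitted for brevity.
+static inline void sparse_semi_spmm_ref(const float* A,
+                                        const float* B,
+                                        const int32_t* widx_dmap,
+                                        const uint32_t* nidx_nnzmap,
+                                        const float* bias,
+                                        float* output,
+                                        int M,
+                                        int N) {
+  for (int i = 0; i < M / 2; i++) {
+    uint32_t first = (i == 0) ? 0 : nidx_nnzmap[i - 1];
+    uint32_t nnz = nidx_nnzmap[i] - first;
+    const float* cur_w = A + first * 2;  // two weights per non-zero
+    const int32_t* dmap = widx_dmap + first;
+    // Mirror GET_SEMI_PARAM_TABLE: the last dmap entry of the previous
+    // pair positions cur_b at this pair's first non-zero input row.
+    const float* cur_b =
+        B + ((first == 0) ? 0 : widx_dmap[first - 1] / (int32_t)sizeof(float));
+    for (int n = 0; n < N; n++) {
+      const float* b = cur_b + n;
+      float acc0 = bias ? bias[2 * i] : 0.f;
+      float acc1 = bias ? bias[2 * i + 1] : 0.f;
+      for (uint32_t k = 0; k < nnz; k++) {
+        acc0 += cur_w[2 * k] * (*b);
+        acc1 += cur_w[2 * k + 1] * (*b);
+        b = (const float*)((uintptr_t)b + (uintptr_t)dmap[k]);  // byte hop
+      }
+      output[(2 * i) * N + n] = acc0;
+      output[(2 * i + 1) * N + n] = acc1;
+    }
+  }
+}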
+#define SPARSE_INT8_F32_W48_V8_SEMI_KERNEL   \
+  "eor v16.16b, v0.16b, v0.16b\n"            \
+  "eor v17.16b, v1.16b, v1.16b\n"            \
+  "eor v18.16b, v2.16b, v2.16b\n"            \
+  "eor v19.16b, v3.16b, v3.16b\n"            \
+  "prfm pldl1keep, [%[a_ptr], #128]\n"       \
+  "eor v20.16b, v4.16b, v4.16b\n"            \
+  "eor v21.16b, v5.16b, v5.16b\n"            \
+  "eor v22.16b, v6.16b, v6.16b\n"            \
+  "prfm pldl1keep, [%[widx_dmap], #128]\n"   \
+  "eor v23.16b, v7.16b, v7.16b\n"            \
+  "eor v24.16b, v8.16b, v8.16b\n"            \
+  "eor v25.16b, v9.16b, v9.16b\n"            \
+  "eor v26.16b, v10.16b, v10.16b\n"          \
+  "prfm pldl1keep, [%[b_ptr], #128]\n"       \
+  "eor v27.16b, v11.16b, v11.16b\n"          \
+  "eor v28.16b, v12.16b, v12.16b\n"          \
+  "eor v29.16b, v13.16b, v13.16b\n"          \
+  "eor v30.16b, v14.16b, v14.16b\n"          \
+  "eor v31.16b, v15.16b, v15.16b\n"          \
+  "eor v8.16b, v0.16b, v0.16b\n"             \
+  "eor v9.16b, v1.16b, v1.16b\n"             \
+  "eor v10.16b, v2.16b, v2.16b\n"            \
+  "eor v11.16b, v3.16b, v3.16b\n"            \
+  "eor v12.16b, v4.16b, v4.16b\n"            \
+  "eor v13.16b, v5.16b, v5.16b\n"            \
+  "eor v14.16b, v6.16b, v6.16b\n"            \
+  "eor v15.16b, v7.16b, v7.16b\n"            \
+  "cbz %w[k], 1f\n"                          \
+
"0:\n" \ + "ld1r {v0.16b}, [%[a_ptr]], #1\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v2.16b, v3.16b, v4.16b}, [%[b_ptr]]\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull v5.8h, v0.8b, v2.8b\n" \ + "smull2 v6.8h, v0.16b, v2.16b\n" \ + "smull v7.8h, v0.8b, v3.8b\n" \ + "saddw v8.4s, v8.4s, v5.4h\n" \ + "saddw2 v9.4s, v9.4s, v5.8h\n" \ + "saddw v10.4s, v10.4s, v6.4h\n" \ + "saddw2 v11.4s, v11.4s, v6.8h\n" \ + "saddw v12.4s, v12.4s, v7.4h\n" \ + "saddw2 v13.4s, v13.4s, v7.8h\n" \ + "smull2 v5.8h, v0.16b, v3.16b\n" \ + "smull v6.8h, v0.8b, v4.8b\n" \ + "smull2 v7.8h, v0.16b, v4.16b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v14.4s, v14.4s, v5.4h\n" \ + "saddw2 v15.4s, v15.4s, v5.8h\n" \ + "saddw v16.4s, v16.4s, v6.4h\n" \ + "saddw2 v17.4s, v17.4s, v6.8h\n" \ + "saddw v18.4s, v18.4s, v7.4h\n" \ + "saddw2 v19.4s, v19.4s, v7.8h\n" \ + "smull v0.8h, v1.8b, v2.8b\n" \ + "smull2 v5.8h, v1.16b, v2.16b\n" \ + "smull v6.8h, v1.8b, v3.8b\n" \ + "smull2 v7.8h, v1.16b, v3.16b\n" \ + "saddw v20.4s, v20.4s, v0.4h\n" \ + "saddw2 v21.4s, v21.4s, v0.8h\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "saddw v22.4s, v22.4s, v5.4h\n" \ + "saddw2 v23.4s, v23.4s, v5.8h\n" \ + "saddw v24.4s, v24.4s, v6.4h\n" \ + "saddw2 v25.4s, v25.4s, v6.8h\n" \ + "saddw v26.4s, v26.4s, v7.4h\n" \ + "saddw2 v27.4s, v27.4s, v7.8h\n" \ + "smull v0.8h, v1.8b, v4.8b\n" \ + "smull2 v5.8h, v1.16b, v4.16b\n" \ + "saddw v28.4s, v28.4s, v0.4h\n" \ + "saddw2 v29.4s, v29.4s, v0.8h\n" \ + "saddw v30.4s, v30.4s, v5.4h\n" \ + "saddw2 v31.4s, v31.4s, v5.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "ld1 {v1.2s}, [%[scale_ptr]]\n" \ + "scvtf v2.4s, v8.4s\n" \ + "scvtf v3.4s, v9.4s\n" \ + "scvtf v4.4s, v10.4s\n" \ + "scvtf v5.4s, v11.4s\n" \ + "scvtf v6.4s, v12.4s\n" \ + "scvtf v7.4s, v13.4s\n" \ + "dup v8.4s, v0.s[0]\n" \ + "dup v9.4s, v0.s[0]\n" \ + "dup v10.4s, v0.s[0]\n" \ + "dup v11.4s, v0.s[0]\n" \ + "dup v12.4s, v0.s[0]\n" \ + "dup v13.4s, v0.s[0]\n" \ + "fmla v8.4s, v2.4s, v1.s[0]\n" \ + "fmla v9.4s, v3.4s, v1.s[0]\n" \ + "fmla v10.4s, v4.4s, v1.s[0]\n" \ + "fmla v11.4s, v5.4s, v1.s[0]\n" \ + "fmla v12.4s, v6.4s, v1.s[0]\n" \ + "fmla v13.4s, v7.4s, v1.s[0]\n" \ + "scvtf v2.4s, v14.4s\n" \ + "scvtf v3.4s, v15.4s\n" \ + "scvtf v4.4s, v16.4s\n" \ + "scvtf v5.4s, v17.4s\n" \ + "scvtf v6.4s, v18.4s\n" \ + "scvtf v7.4s, v19.4s\n" \ + "dup v14.4s, v0.s[0]\n" \ + "dup v15.4s, v0.s[0]\n" \ + "dup v16.4s, v0.s[0]\n" \ + "dup v17.4s, v0.s[0]\n" \ + "dup v18.4s, v0.s[0]\n" \ + "dup v19.4s, v0.s[0]\n" \ + "fmla v14.4s, v2.4s, v1.s[0]\n" \ + "fmla v15.4s, v3.4s, v1.s[0]\n" \ + "fmla v16.4s, v4.4s, v1.s[0]\n" \ + "fmla v17.4s, v5.4s, v1.s[0]\n" \ + "fmla v18.4s, v6.4s, v1.s[0]\n" \ + "fmla v19.4s, v7.4s, v1.s[0]\n" \ + "scvtf v2.4s, v20.4s\n" \ + "scvtf v3.4s, v21.4s\n" \ + "scvtf v4.4s, v22.4s\n" \ + "scvtf v5.4s, v23.4s\n" \ + "scvtf v6.4s, v24.4s\n" \ + "scvtf v7.4s, v25.4s\n" \ + "dup v20.4s, v0.s[1]\n" \ + "dup v21.4s, v0.s[1]\n" \ + "dup v22.4s, v0.s[1]\n" \ + "dup v23.4s, v0.s[1]\n" \ + "dup v24.4s, v0.s[1]\n" \ + "dup v25.4s, v0.s[1]\n" \ + "fmla v20.4s, v2.4s, v1.s[1]\n" \ + "fmla v21.4s, v3.4s, v1.s[1]\n" \ + "fmla v22.4s, v4.4s, v1.s[1]\n" \ + "fmla v23.4s, v5.4s, v1.s[1]\n" \ + "fmla v24.4s, v6.4s, v1.s[1]\n" \ + "fmla v25.4s, v7.4s, v1.s[1]\n" \ + "scvtf v2.4s, v26.4s\n" \ + "scvtf v3.4s, v27.4s\n" \ + "scvtf v4.4s, v28.4s\n" \ + "scvtf v5.4s, v29.4s\n" \ + "scvtf v6.4s, v30.4s\n" \ + "scvtf v7.4s, v31.4s\n" \ + "dup v26.4s, v0.s[1]\n" \ + "dup v27.4s, 
v0.s[1]\n" \ + "dup v28.4s, v0.s[1]\n" \ + "dup v29.4s, v0.s[1]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "fmla v26.4s, v2.4s, v1.s[1]\n" \ + "fmla v27.4s, v3.4s, v1.s[1]\n" \ + "fmla v28.4s, v4.4s, v1.s[1]\n" \ + "fmla v29.4s, v5.4s, v1.s[1]\n" \ + "fmla v30.4s, v6.4s, v1.s[1]\n" \ + "fmla v31.4s, v7.4s, v1.s[1]\n" + +#define SPARSE_INT8_F32_W32_V8_SEMI2_KERNEL \ + "eor v16.16b, v0.16b, v0.16b\n" \ + "eor v17.16b, v1.16b, v1.16b\n" \ + "eor v18.16b, v2.16b, v2.16b\n" \ + "eor v19.16b, v3.16b, v3.16b\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "eor v20.16b, v4.16b, v4.16b\n" \ + "eor v21.16b, v5.16b, v5.16b\n" \ + "eor v22.16b, v6.16b, v6.16b\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "eor v23.16b, v7.16b, v7.16b\n" \ + "eor v24.16b, v8.16b, v8.16b\n" \ + "eor v25.16b, v9.16b, v9.16b\n" \ + "eor v26.16b, v10.16b, v10.16b\n" \ + "prfm pldl1keep, [%[b_ptr], #128]\n" \ + "eor v27.16b, v11.16b, v11.16b\n" \ + "eor v28.16b, v12.16b, v12.16b\n" \ + "eor v29.16b, v13.16b, v13.16b\n" \ + "eor v30.16b, v14.16b, v14.16b\n" \ + "eor v31.16b, v15.16b, v15.16b\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ld1r {v2.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v3.16b, v4.16b}, [%[b_ptr]]\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull v5.8h, v1.8b, v3.8b\n" \ + "smull2 v6.8h, v1.16b, v3.16b\n" \ + "smull v7.8h, v1.8b, v4.8b\n" \ + "smull2 v8.8h, v1.16b, v4.16b\n" \ + "smull v9.8h, v2.8b, v3.8b\n" \ + "smull2 v10.8h, v2.16b, v3.16b\n" \ + "smull v11.8h, v2.8b, v4.8b\n" \ + "smull2 v12.8h, v2.16b, v4.16b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v16.4s, v16.4s, v5.4h\n" \ + "saddw2 v17.4s, v17.4s, v5.8h\n" \ + "saddw v18.4s, v18.4s, v6.4h\n" \ + "saddw2 v19.4s, v19.4s, v6.8h\n" \ + "saddw v20.4s, v20.4s, v7.4h\n" \ + "saddw2 v21.4s, v21.4s, v7.8h\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "saddw v22.4s, v22.4s, v8.4h\n" \ + "saddw2 v23.4s, v23.4s, v8.8h\n" \ + "saddw v24.4s, v24.4s, v9.4h\n" \ + "saddw2 v25.4s, v25.4s, v9.8h\n" \ + "saddw v26.4s, v26.4s, v10.4h\n" \ + "saddw2 v27.4s, v27.4s, v10.8h\n" \ + "saddw v28.4s, v28.4s, v11.4h\n" \ + "saddw2 v29.4s, v29.4s, v11.8h\n" \ + "saddw v30.4s, v30.4s, v12.4h\n" \ + "saddw2 v31.4s, v31.4s, v12.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v0.4s, v16.4s\n" \ + "scvtf v1.4s, v17.4s\n" \ + "scvtf v2.4s, v18.4s\n" \ + "scvtf v3.4s, v19.4s\n" \ + "scvtf v4.4s, v20.4s\n" \ + "scvtf v5.4s, v21.4s\n" \ + "scvtf v6.4s, v22.4s\n" \ + "scvtf v7.4s, v23.4s\n" \ + "scvtf v8.4s, v24.4s\n" \ + "scvtf v9.4s, v25.4s\n" \ + "scvtf v10.4s, v26.4s\n" \ + "scvtf v11.4s, v27.4s\n" \ + "scvtf v12.4s, v28.4s\n" \ + "scvtf v13.4s, v29.4s\n" \ + "scvtf v14.4s, v30.4s\n" \ + "scvtf v15.4s, v31.4s\n" \ + "ld1 {v31.2s}, [%[bias_ptr]]\n" \ + "ld1 {v30.2s}, [%[scale_ptr]]\n" \ + "dup v16.4s, v31.s[0]\n" \ + "dup v17.4s, v31.s[0]\n" \ + "dup v18.4s, v31.s[0]\n" \ + "dup v19.4s, v31.s[0]\n" \ + "dup v20.4s, v31.s[0]\n" \ + "dup v21.4s, v31.s[0]\n" \ + "dup v22.4s, v31.s[0]\n" \ + "dup v23.4s, v31.s[0]\n" \ + "dup v24.4s, v31.s[1]\n" \ + "dup v25.4s, v31.s[1]\n" \ + "dup v26.4s, v31.s[1]\n" \ + "dup v27.4s, v31.s[1]\n" \ + "fmla v16.4s, v0.4s, v30.s[0]\n" \ + "fmla v17.4s, v1.4s, v30.s[0]\n" \ + "fmla v18.4s, v2.4s, v30.s[0]\n" \ + "fmla v19.4s, v3.4s, v30.s[0]\n" \ + "fmla v20.4s, v4.4s, v30.s[0]\n" \ + "fmla v21.4s, v5.4s, v30.s[0]\n" \ + "fmla v22.4s, v6.4s, v30.s[0]\n" \ + "fmla v23.4s, v7.4s, v30.s[0]\n" \ + "dup v0.4s, v30.s[1]\n" \ + "dup v28.4s, v24.s[0]\n" \ + "dup 
v29.4s, v24.s[0]\n" \ + "dup v30.4s, v24.s[0]\n" \ + "dup v31.4s, v24.s[0]\n" \ + "fmla v24.4s, v8.4s, v0.s[0]\n" \ + "fmla v25.4s, v9.4s, v0.s[0]\n" \ + "fmla v26.4s, v10.4s, v0.s[0]\n" \ + "fmla v27.4s, v11.4s, v0.s[0]\n" \ + "fmla v28.4s, v12.4s, v0.s[0]\n" \ + "fmla v29.4s, v13.4s, v0.s[0]\n" \ + "fmla v30.4s, v14.4s, v0.s[0]\n" \ + "fmla v31.4s, v15.4s, v0.s[0]\n" + +#define SPARSE_INT8_F32_W16_V8_SEMI2_KERNEL \ + "eor v8.16b, v1.16b, v1.16b\n" \ + "eor v9.16b, v2.16b, v2.16b\n" \ + "eor v10.16b, v3.16b, v3.16b\n" \ + "eor v11.16b, v4.16b, v4.16b\n" \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "eor v12.16b, v5.16b, v5.16b\n" \ + "eor v13.16b, v6.16b, v6.16b\n" \ + "eor v14.16b, v7.16b, v7.16b\n" \ + "eor v15.16b, v8.16b, v8.16b\n" \ + "dup v24.4s, v0.s[0]\n" \ + "dup v25.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[a_ptr], #64]\n" \ + "dup v26.4s, v0.s[0]\n" \ + "dup v27.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[widx_dmap], #64]\n" \ + "dup v28.4s, v0.s[1]\n" \ + "dup v29.4s, v0.s[1]\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "ld1 {v0.2s}, [%[scale_ptr]]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "ld1r {v2.16b}, [%[a_ptr]], #1\n" \ + "sxtw x1, w1\n" \ + "ld1 {v3.16b}, [%[b_ptr]]\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull v4.8h, v1.8b, v3.8b\n" \ + "smull2 v5.8h, v1.16b, v3.16b\n" \ + "smull v6.8h, v2.8b, v3.8b\n" \ + "smull2 v7.8h, v2.16b, v3.16b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v8.4s, v8.4s, v4.4h\n" \ + "saddw2 v9.4s, v9.4s, v4.8h\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "saddw v10.4s, v10.4s, v5.4h\n" \ + "saddw2 v11.4s, v11.4s, v5.8h\n" \ + "saddw v12.4s, v12.4s, v6.4h\n" \ + "prfm pldl1keep, [%[a_ptr], #64]\n" \ + "saddw2 v13.4s, v13.4s, v6.8h\n" \ + "saddw v14.4s, v14.4s, v7.4h\n" \ + "saddw2 v15.4s, v15.4s, v7.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v1.4s, v8.4s\n" \ + "scvtf v2.4s, v9.4s\n" \ + "scvtf v3.4s, v10.4s\n" \ + "scvtf v4.4s, v11.4s\n" \ + "scvtf v5.4s, v12.4s\n" \ + "scvtf v6.4s, v13.4s\n" \ + "scvtf v7.4s, v14.4s\n" \ + "scvtf v8.4s, v15.4s\n" \ + "fmla v24.4s, v1.4s, v0.s[0]\n" \ + "fmla v25.4s, v2.4s, v0.s[0]\n" \ + "fmla v26.4s, v3.4s, v0.s[0]\n" \ + "fmla v27.4s, v4.4s, v0.s[0]\n" \ + "fmla v28.4s, v5.4s, v0.s[1]\n" \ + "fmla v29.4s, v6.4s, v0.s[1]\n" \ + "fmla v30.4s, v7.4s, v0.s[1]\n" \ + "fmla v31.4s, v8.4s, v0.s[1]\n" + +#define SPARSE_INT8_F32_W8_V8_SEMI2_KERNEL \ + "eor v8.16b, v1.16b, v1.16b\n" \ + "eor v9.16b, v2.16b, v2.16b\n" \ + "eor v10.16b, v3.16b, v3.16b\n" \ + "eor v11.16b, v4.16b, v4.16b\n" \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "dup v28.4s, v0.s[0]\n" \ + "dup v29.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "ld1 {v0.2s}, [%[scale_ptr]]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "ld1r {v2.16b}, [%[a_ptr]], #1\n" \ + "sxtw x1, w1\n" \ + "ld1 {v3.8b}, [%[b_ptr]]\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull v4.8h, v1.8b, v3.8b\n" \ + "smull v5.8h, v2.8b, v3.8b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v8.4s, v8.4s, v4.4h\n" \ + "saddw2 v9.4s, v9.4s, v4.8h\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "saddw v10.4s, v10.4s, v5.4h\n" \ + "saddw2 v11.4s, v11.4s, v5.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v1.4s, v8.4s\n" \ + "scvtf v2.4s, v9.4s\n" \ + "scvtf v3.4s, v10.4s\n" \ + "scvtf v4.4s, v11.4s\n" \ + "fmla v28.4s, v1.4s, v0.s[0]\n" \ + "fmla v29.4s, v2.4s, 
v0.s[0]\n" \ + "fmla v30.4s, v3.4s, v0.s[1]\n" \ + "fmla v31.4s, v4.4s, v0.s[1]\n" + +#define SPARSE_INT8_F32_W4_V8_SEMI2_KERNEL \ + "eor v8.16b, v1.16b, v1.16b\n" \ + "eor v9.16b, v2.16b, v2.16b\n" \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "dup v30.4s, v0.s[0]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "ld1 {v0.2s}, [%[scale_ptr]]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "ld1r {v2.16b}, [%[a_ptr]], #1\n" \ + "ldrsb w2, [%[b_ptr]]\n" \ + "ldrsb w3, [%[b_ptr], #1]\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldrsb w4, [%[b_ptr], #2]\n" \ + "ldrsb w5, [%[b_ptr], #3]\n" \ + "sxtw x1, w1\n" \ + "mov v3.b[0], w2\n" \ + "mov v3.b[1], w3\n" \ + "mov v3.b[2], w4\n" \ + "mov v3.b[3], w5\n" \ + "smull v4.8h, v1.8b, v3.8b\n" \ + "smull v5.8h, v2.8b, v3.8b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v8.4s, v8.4s, v4.4h\n" \ + "saddw v9.4s, v9.4s, v5.4h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v1.4s, v8.4s\n" \ + "scvtf v2.4s, v9.4s\n" \ + "fmla v30.4s, v1.4s, v0.s[0]\n" \ + "fmla v31.4s, v2.4s, v0.s[1]\n" + +#define SPARSE_INT8_F32_W32_V8_SEMI1_KERNEL \ + "eor v11.16b, v0.16b, v0.16b\n" \ + "eor v12.16b, v1.16b, v1.16b\n" \ + "prfm pldl1keep, [%[a_ptr], #32]\n" \ + "eor v13.16b, v2.16b, v2.16b\n" \ + "eor v14.16b, v3.16b, v3.16b\n" \ + "prfm pldl1keep, [%[widx_dmap], #32]\n" \ + "eor v15.16b, v4.16b, v4.16b\n" \ + "eor v16.16b, v5.16b, v5.16b\n" \ + "prfm pldl1keep, [%[b_ptr], #32]\n" \ + "eor v17.16b, v6.16b, v6.16b\n" \ + "eor v18.16b, v7.16b, v7.16b\n" \ + "dup v21.4s, %w[vbias]\n" \ + "dup v22.4s, v21.s[0]\n" \ + "dup v23.4s, v21.s[0]\n" \ + "dup v24.4s, v21.s[0]\n" \ + "dup v25.4s, v21.s[0]\n" \ + "dup v26.4s, v21.s[0]\n" \ + "dup v27.4s, v21.s[0]\n" \ + "dup v28.4s, v21.s[0]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v0.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v1.16b, v2.16b}, [%[b_ptr]]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "smull v3.8h, v0.8b, v1.8b\n" \ + "smull v5.8h, v0.8b, v2.8b\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull2 v4.8h, v0.16b, v1.16b\n" \ + "smull2 v6.8h, v0.16b, v2.16b\n" \ + "saddw v11.4s, v11.4s, v3.4h\n" \ + "saddw v13.4s, v13.4s, v4.4h\n" \ + "prfm pldl1keep, [%[b_ptr], #32]\n" \ + "saddw v15.4s, v15.4s, v5.4h\n" \ + "saddw v17.4s, v17.4s, v6.4h\n" \ + "saddw2 v12.4s, v12.4s, v3.8h\n" \ + "saddw2 v14.4s, v14.4s, v4.8h\n" \ + "saddw2 v16.4s, v16.4s, v5.8h\n" \ + "saddw2 v18.4s, v18.4s, v6.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v3.4s, v11.4s\n" \ + "scvtf v4.4s, v12.4s\n" \ + "scvtf v5.4s, v13.4s\n" \ + "scvtf v6.4s, v14.4s\n" \ + "scvtf v7.4s, v15.4s\n" \ + "scvtf v8.4s, v16.4s\n" \ + "scvtf v9.4s, v17.4s\n" \ + "scvtf v10.4s, v18.4s\n" /* scale */ \ + "dup v31.4s, %w[vscale]\n" \ + "fmla v21.4s, v3.4s, v31.s[0]\n" \ + "fmla v22.4s, v4.4s, v31.s[0]\n" \ + "fmla v23.4s, v5.4s, v31.s[0]\n" \ + "fmla v24.4s, v6.4s, v31.s[0]\n" \ + "fmla v25.4s, v7.4s, v31.s[0]\n" \ + "fmla v26.4s, v8.4s, v31.s[0]\n" \ + "fmla v27.4s, v9.4s, v31.s[0]\n" \ + "fmla v28.4s, v10.4s, v31.s[0]\n" + +#define SPARSE_INT8_F32_W16_V8_SEMI1_KERNEL \ + "eor v11.16b, v0.16b, v0.16b\n" \ + "eor v12.16b, v1.16b, v1.16b\n" \ + "eor v13.16b, v2.16b, v2.16b\n" \ + "prfm pldl1keep, [%[b_ptr], #16]\n" \ + "eor v14.16b, v3.16b, v3.16b\n" \ + "dup v21.4s, %w[vbias]\n" \ + "dup v22.4s, v21.s[0]\n" \ + "dup v23.4s, v21.s[0]\n" \ + "dup v24.4s, v21.s[0]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v0.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v1.16b}, 
[%[b_ptr]]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "smull v3.8h, v0.8b, v1.8b\n" \ + "smull2 v4.8h, v0.16b, v1.16b\n" \ + "prfm pldl1keep, [%[b_ptr], #16]\n" \ + "subs %w[k], %w[k], #1\n" \ + "saddw v11.4s, v11.4s, v3.4h\n" \ + "saddw2 v12.4s, v12.4s, v3.8h\n" \ + "saddw v13.4s, v13.4s, v4.4h\n" \ + "saddw2 v14.4s, v14.4s, v4.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v5.4s, v11.4s\n" \ + "scvtf v6.4s, v12.4s\n" \ + "scvtf v7.4s, v13.4s\n" \ + "scvtf v8.4s, v14.4s\n" /* scale */ \ + "dup v2.4s, %w[vscale]\n" \ + "fmla v21.4s, v5.4s, v2.s[0]\n" \ + "fmla v22.4s, v6.4s, v2.s[0]\n" \ + "fmla v23.4s, v7.4s, v2.s[0]\n" \ + "fmla v24.4s, v8.4s, v2.s[0]\n" + +#define SPARSE_INT8_F32_W8_V8_SEMI1_KERNEL \ + "eor v11.16b, v0.16b, v0.16b\n" \ + "eor v12.16b, v1.16b, v1.16b\n" \ + "dup v21.4s, %w[vbias]\n" \ + "dup v22.4s, v21.s[0]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v0.8b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v1.8b}, [%[b_ptr]]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "smull v3.8h, v0.8b, v1.8b\n" \ + "subs %w[k], %w[k], #1\n" \ + "saddw v11.4s, v11.4s, v3.4h\n" \ + "saddw2 v12.4s, v12.4s, v3.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v4.4s, v11.4s\n" \ + "scvtf v5.4s, v12.4s\n" /* scale */ \ + "dup v2.4s, %w[vscale]\n" \ + "fmla v21.4s, v4.4s, v2.s[0]\n" \ + "fmla v22.4s, v5.4s, v2.s[0]\n" + +#define SPARSE_INT8_F32_W4_V8_SEMI1_KERNEL \ + "eor v11.16b, v0.16b, v0.16b\n" \ + "dup v21.4s, %w[vbias]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v0.8b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "ldrsb w2, [%[b_ptr]]\n" \ + "ldrsb w3, [%[b_ptr], #1]\n" \ + "ldrsb w4, [%[b_ptr], #2]\n" \ + "ldrsb w5, [%[b_ptr], #3]\n" \ + "sxtw x1, w1\n" \ + "mov v1.b[0], w2\n" \ + "mov v1.b[1], w3\n" \ + "mov v1.b[2], w4\n" \ + "mov v1.b[3], w5\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "smull v3.8h, v0.8b, v1.8b\n" \ + "subs %w[k], %w[k], #1\n" \ + "saddw v11.4s, v11.4s, v3.4h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v4.4s, v11.4s\n" /* scale */ \ + "dup v2.4s, %w[vscale]\n" \ + "fmla v21.4s, v4.4s, v2.s[0]\n" + +#define SPARSE_INT8_F32_W48_V8_SEMI_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v8.4s, v8.4s, v0.4s\n" /* relu */ \ + "fmax v9.4s, v9.4s, v0.4s\n" /* relu */ \ + "fmax v10.4s, v10.4s, v0.4s\n" /* relu */ \ + "fmax v11.4s, v11.4s, v0.4s\n" /* relu */ \ + "fmax v12.4s, v12.4s, v0.4s\n" /* relu */ \ + "fmax v13.4s, v13.4s, v0.4s\n" /* relu */ \ + "fmax v14.4s, v14.4s, v0.4s\n" /* relu */ \ + "fmax v15.4s, v15.4s, v0.4s\n" /* relu */ \ + "fmax v16.4s, v16.4s, v0.4s\n" /* relu */ \ + "fmax v17.4s, v17.4s, v0.4s\n" /* relu */ \ + "fmax v18.4s, v18.4s, v0.4s\n" /* relu */ \ + "fmax v19.4s, v19.4s, v0.4s\n" /* relu */ \ + "fmax v20.4s, v20.4s, v0.4s\n" /* relu */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W32_V8_SEMI2_RELU \ + /* do relu */ \ + "cmp 
%w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v16.4s, v16.4s, v0.4s\n" /* relu */ \ + "fmax v17.4s, v17.4s, v0.4s\n" /* relu */ \ + "fmax v18.4s, v18.4s, v0.4s\n" /* relu */ \ + "fmax v19.4s, v19.4s, v0.4s\n" /* relu */ \ + "fmax v20.4s, v20.4s, v0.4s\n" /* relu */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W16_V8_SEMI2_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W8_V8_SEMI2_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W4_V8_SEMI2_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W32_V8_SEMI1_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v30.4s, #0\n" /* for relu */ \ + "fmax v21.4s, v21.4s, v30.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v30.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v30.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v30.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v30.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v30.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v30.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v30.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W16_V8_SEMI1_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v9.4s, #0\n" /* for relu */ \ + "fmax v21.4s, v21.4s, v9.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v9.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v9.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v9.4s\n" 
/* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W8_V8_SEMI1_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v9.4s, #0\n" /* for relu */ \ + "fmax v21.4s, v21.4s, v9.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v9.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W4_V8_SEMI1_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v9.4s, #0\n" /* for relu */ \ + "fmax v21.4s, v21.4s, v9.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W48_V8_SEMI_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v8.4s, v8.4s, v0.4s\n" /* relu */ \ + "fmax v9.4s, v9.4s, v0.4s\n" /* relu */ \ + "fmax v10.4s, v10.4s, v0.4s\n" /* relu */ \ + "fmax v11.4s, v11.4s, v0.4s\n" /* relu */ \ + "fmax v12.4s, v12.4s, v0.4s\n" /* relu */ \ + "fmax v13.4s, v13.4s, v0.4s\n" /* relu */ \ + "fmax v14.4s, v14.4s, v0.4s\n" /* relu */ \ + "fmax v15.4s, v15.4s, v0.4s\n" /* relu */ \ + "fmax v16.4s, v16.4s, v0.4s\n" /* relu */ \ + "fmax v17.4s, v17.4s, v0.4s\n" /* relu */ \ + "fmax v18.4s, v18.4s, v0.4s\n" /* relu */ \ + "fmax v19.4s, v19.4s, v0.4s\n" /* relu */ \ + "fmax v20.4s, v20.4s, v0.4s\n" /* relu */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v8.4s, v8.4s, v1.4s\n" /* relu */ \ + "fmin v9.4s, v9.4s, v1.4s\n" /* relu */ \ + "fmin v10.4s, v10.4s, v1.4s\n" /* relu */ \ + "fmin v11.4s, v11.4s, v1.4s\n" /* relu */ \ + "fmin v12.4s, v12.4s, v1.4s\n" /* relu */ \ + "fmin v13.4s, v13.4s, v1.4s\n" /* relu */ \ + "fmin v14.4s, v14.4s, v1.4s\n" /* relu */ \ + "fmin v15.4s, v15.4s, v1.4s\n" /* relu */ \ + "fmin v16.4s, v16.4s, v1.4s\n" /* relu */ \ + "fmin v17.4s, v17.4s, v1.4s\n" /* relu */ \ + "fmin v18.4s, v18.4s, v1.4s\n" /* relu */ \ + "fmin v19.4s, v19.4s, v1.4s\n" /* relu */ \ + "fmin v20.4s, v20.4s, v1.4s\n" /* relu6 */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "fmin v23.4s, v23.4s, v1.4s\n" /* relu6 */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "fmin v25.4s, v25.4s, v1.4s\n" /* relu6 */ \ + "fmin v26.4s, v26.4s, v1.4s\n" /* relu6 */ \ + "fmin v27.4s, v27.4s, v1.4s\n" /* relu6 */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "fmin v29.4s, v29.4s, v1.4s\n" /* relu6 */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W32_V8_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v16.4s, v16.4s, v0.4s\n" /* relu */ \ + "fmax v17.4s, 
v17.4s, v0.4s\n" /* relu */ \ + "fmax v18.4s, v18.4s, v0.4s\n" /* relu */ \ + "fmax v19.4s, v19.4s, v0.4s\n" /* relu */ \ + "fmax v20.4s, v20.4s, v0.4s\n" /* relu */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v16.4s, v16.4s, v1.4s\n" /* relu */ \ + "fmin v17.4s, v17.4s, v1.4s\n" /* relu */ \ + "fmin v18.4s, v18.4s, v1.4s\n" /* relu */ \ + "fmin v19.4s, v19.4s, v1.4s\n" /* relu */ \ + "fmin v20.4s, v20.4s, v1.4s\n" /* relu6 */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "fmin v23.4s, v23.4s, v1.4s\n" /* relu6 */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "fmin v25.4s, v25.4s, v1.4s\n" /* relu6 */ \ + "fmin v26.4s, v26.4s, v1.4s\n" /* relu6 */ \ + "fmin v27.4s, v27.4s, v1.4s\n" /* relu6 */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "fmin v29.4s, v29.4s, v1.4s\n" /* relu6 */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W16_V8_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "fmin v25.4s, v25.4s, v1.4s\n" /* relu6 */ \ + "fmin v26.4s, v26.4s, v1.4s\n" /* relu6 */ \ + "fmin v27.4s, v27.4s, v1.4s\n" /* relu6 */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "fmin v29.4s, v29.4s, v1.4s\n" /* relu6 */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W8_V8_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "fmin v29.4s, v29.4s, v1.4s\n" /* relu6 */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W4_V8_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + 
"b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W32_V8_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "fmin v23.4s, v23.4s, v1.4s\n" /* relu6 */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "fmin v25.4s, v25.4s, v1.4s\n" /* relu6 */ \ + "fmin v26.4s, v26.4s, v1.4s\n" /* relu6 */ \ + "fmin v27.4s, v27.4s, v1.4s\n" /* relu6 */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W16_V8_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "fmin v23.4s, v23.4s, v1.4s\n" /* relu6 */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W8_V8_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W4_V8_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W48_V8_SEMI_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v12.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v12.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v13.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v13.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v11.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v12.16b, v5.16b, v4.16b \n" /* 
choose*/ \ + "bif v13.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v14.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v14.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v15.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v16.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v14.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v15.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v16.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v18.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v17.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v18.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v19.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v24.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v25.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v26.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v27.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v28.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v29.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v30.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v31.16b, v7.16b, v6.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W32_V8_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v16.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v18.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, 
v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v16.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v17.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v18.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v19.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v20.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v21.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v22.16b, v15.16b, v14.16b \n" /* choose*/ \ + "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v24.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v25.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v26.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v27.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v28.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v29.16b, v15.16b, v14.16b \n" /* choose*/ \ + "fcmge v2.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v30.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v31.16b, v5.16b, v4.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W32_V8_SEMI2_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v16.4s, v1.4s \n" \ + "fadd v6.4s, v17.4s, v1.4s \n" \ + "fadd v8.4s, v18.4s, v1.4s \n" \ + "fadd v10.4s, v19.4s, v1.4s \n" \ + "fadd v12.4s, v20.4s, v1.4s \n" \ + "fadd v14.4s, v21.4s, v1.4s \n" \ + "fmul v5.4s, v16.4s, v2.4s \n" \ + "fmul v7.4s, v17.4s, v2.4s \n" \ + "fmul v9.4s, v18.4s, v2.4s \n" \ + "fmul v11.4s, v19.4s, v2.4s \n" \ + "fmul v13.4s, v20.4s, v2.4s \n" \ + "fmul v15.4s, v21.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmul v16.4s, v5.4s, v4.4s \n" \ + "fmul v17.4s, v7.4s, v6.4s \n" \ + "fmul v18.4s, v9.4s, v8.4s \n" \ + "fmul v19.4s, v11.4s, v10.4s \n" \ + "fmul v20.4s, v13.4s, v12.4s \n" \ + "fmul v21.4s, v15.4s, v14.4s \n" \ + "fadd v4.4s, v22.4s, v1.4s \n" \ + "fadd v6.4s, v23.4s, v1.4s \n" \ + "fadd v8.4s, v24.4s, v1.4s \n" \ + "fadd v10.4s, v25.4s, v1.4s \n" \ + "fadd v12.4s, v26.4s, v1.4s \n" \ + "fadd v14.4s, v27.4s, v1.4s \n" \ + "fmul v5.4s, v22.4s, v2.4s \n" \ 
+ "fmul v7.4s, v23.4s, v2.4s \n" \ + "fmul v9.4s, v24.4s, v2.4s \n" \ + "fmul v11.4s, v25.4s, v2.4s \n" \ + "fmul v13.4s, v26.4s, v2.4s \n" \ + "fmul v15.4s, v27.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmul v22.4s, v5.4s, v4.4s \n" \ + "fmul v23.4s, v7.4s, v6.4s \n" \ + "fmul v24.4s, v9.4s, v8.4s \n" \ + "fmul v25.4s, v11.4s, v10.4s \n" \ + "fmul v26.4s, v13.4s, v12.4s \n" \ + "fmul v27.4s, v15.4s, v14.4s \n" \ + "fadd v4.4s, v28.4s, v1.4s \n" \ + "fadd v6.4s, v29.4s, v1.4s \n" \ + "fadd v8.4s, v30.4s, v1.4s \n" \ + "fadd v10.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v28.4s, v2.4s \n" \ + "fmul v7.4s, v29.4s, v2.4s \n" \ + "fmul v9.4s, v30.4s, v2.4s \n" \ + "fmul v11.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmul v28.4s, v5.4s, v4.4s \n" \ + "fmul v29.4s, v7.4s, v6.4s \n" \ + "fmul v30.4s, v9.4s, v8.4s \n" \ + "fmul v31.4s, v11.4s, v10.4s \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W16_V8_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v24.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v25.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v26.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v27.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v28.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v29.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v30.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif v31.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W16_V8_SEMI2_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v24.4s, v1.4s \n" \ + "fadd v6.4s, v25.4s, v1.4s \n" \ + "fadd v8.4s, v26.4s, v1.4s \n" \ + "fadd v10.4s, v27.4s, v1.4s \n" \ + "fadd v12.4s, v28.4s, v1.4s \n" \ + "fadd v14.4s, v29.4s, v1.4s \n" \ + "fadd v16.4s, v30.4s, v1.4s \n" \ + "fadd 
v18.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v24.4s, v2.4s \n" \ + "fmul v7.4s, v25.4s, v2.4s \n" \ + "fmul v9.4s, v26.4s, v2.4s \n" \ + "fmul v11.4s, v27.4s, v2.4s \n" \ + "fmul v13.4s, v28.4s, v2.4s \n" \ + "fmul v15.4s, v29.4s, v2.4s \n" \ + "fmul v17.4s, v30.4s, v2.4s \n" \ + "fmul v19.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmax v16.4s, v16.4s, v0.4s \n" \ + "fmax v18.4s, v18.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmin v16.4s, v16.4s, v3.4s \n" \ + "fmin v18.4s, v18.4s, v3.4s \n" \ + "fmul v24.4s, v5.4s, v4.4s \n" \ + "fmul v25.4s, v7.4s, v6.4s \n" \ + "fmul v26.4s, v9.4s, v8.4s \n" \ + "fmul v27.4s, v11.4s, v10.4s \n" \ + "fmul v28.4s, v13.4s, v12.4s \n" \ + "fmul v29.4s, v15.4s, v14.4s \n" \ + "fmul v30.4s, v17.4s, v16.4s \n" \ + "fmul v31.4s, v19.4s, v18.4s \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W8_V8_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v10.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v28.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v29.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v30.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif v31.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W8_V8_SEMI2_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v28.4s, v1.4s \n" \ + "fadd v6.4s, v29.4s, v1.4s \n" \ + "fadd v8.4s, v30.4s, v1.4s \n" \ + "fadd v10.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v28.4s, v2.4s \n" \ + "fmul v7.4s, v29.4s, v2.4s \n" \ + "fmul v9.4s, v30.4s, v2.4s \n" \ + "fmul v11.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmul v28.4s, v5.4s, v4.4s \n" \ + "fmul v29.4s, v7.4s, v6.4s \n" \ + "fmul v30.4s, v9.4s, v8.4s \n" \ + "fmul v31.4s, v11.4s, v10.4s \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W4_V8_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v14.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v30.16b, v15.16b, v14.16b \n" 
/* choose*/ \ + "bif v31.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W4_V8_SEMI2_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v30.4s, v1.4s \n" \ + "fadd v6.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v30.4s, v2.4s \n" \ + "fmul v7.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v30.4s, v5.4s, v4.4s \n" \ + "fmul v31.4s, v7.4s, v6.4s \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W32_V8_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v21.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v22.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v23.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v24.16b, v9.16b, v8.16b \n" /* choose*/ \ + "fcmge v2.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v25.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v26.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v27.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v28.16b, v9.16b, v8.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W32_V8_SEMI1_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fadd v8.4s, v22.4s, v1.4s \n" \ + "fadd v10.4s, v23.4s, v1.4s \n" \ + "fadd v12.4s, v24.4s, v1.4s \n" \ + "fadd v14.4s, v25.4s, v1.4s \n" \ + "fadd v16.4s, v26.4s, v1.4s \n" \ + "fadd v18.4s, v27.4s, v1.4s \n" \ + "fadd v30.4s, v28.4s, v1.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmul v9.4s, v22.4s, v2.4s \n" \ + "fmul v11.4s, v23.4s, v2.4s \n" \ + "fmul v13.4s, v24.4s, v2.4s \n" \ + "fmul v15.4s, v25.4s, v2.4s \n" \ + "fmul v17.4s, v26.4s, v2.4s \n" \ + "fmul v19.4s, v27.4s, v2.4s \n" \ + "fmul v31.4s, v28.4s, v2.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmax v16.4s, v16.4s, v0.4s \n" \ + "fmax v18.4s, v18.4s, v0.4s \n" \ + "fmax v30.4s, v30.4s, v0.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmin v16.4s, v16.4s, v3.4s \n" \ + "fmin v18.4s, 
v18.4s, v3.4s \n" \ + "fmin v30.4s, v30.4s, v3.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fmul v22.4s, v9.4s, v8.4s \n" \ + "fmul v23.4s, v11.4s, v10.4s \n" \ + "fmul v24.4s, v13.4s, v12.4s \n" \ + "fmul v25.4s, v15.4s, v14.4s \n" \ + "fmul v26.4s, v17.4s, v16.4s \n" \ + "fmul v27.4s, v19.4s, v18.4s \n" \ + "fmul v28.4s, v31.4s, v30.4s \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W16_V8_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v21.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v22.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v23.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v24.16b, v9.16b, v8.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W16_V8_SEMI1_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fadd v8.4s, v22.4s, v1.4s \n" \ + "fadd v10.4s, v23.4s, v1.4s \n" \ + "fadd v12.4s, v24.4s, v1.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmul v9.4s, v22.4s, v2.4s \n" \ + "fmul v11.4s, v23.4s, v2.4s \n" \ + "fmul v13.4s, v24.4s, v2.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fmul v22.4s, v9.4s, v8.4s \n" \ + "fmul v23.4s, v11.4s, v10.4s \n" \ + "fmul v24.4s, v13.4s, v12.4s \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W8_V8_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v21.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v22.16b, v5.16b, v4.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W8_V8_SEMI1_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fadd v8.4s, v22.4s, v1.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmul v9.4s, v22.4s, v2.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fmul v22.4s, v9.4s, v8.4s \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W4_V8_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + 
"bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v21.16b, v3.16b, v2.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W4_V8_SEMI1_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W48_V8_SEMI_OUT \ + SPARSE_INT8_F32_W48_V8_SEMI_KERNEL \ + SPARSE_INT8_F32_W48_V8_SEMI_RELU \ + SPARSE_INT8_F32_W48_V8_SEMI_RELU6 \ + SPARSE_INT8_F32_W48_V8_SEMI_LEAKY_RELU \ + /* store result */ \ + "ld1 {v0.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v1.4s, v8.4s, v0.4s\n" \ + "fcmge v2.4s, v9.4s, v0.4s\n" \ + "fcmge v3.4s, v10.4s, v0.4s\n" \ + "fcmge v4.4s, v11.4s, v0.4s\n" \ + "fcmge v5.4s, v12.4s, v0.4s\n" \ + "fcmge v6.4s, v13.4s, v0.4s\n" \ + "fcmge v7.4s, v14.4s, v0.4s\n" \ + "bif v8.16b, v0.16b, v1.16b \n" \ + "bif v9.16b, v0.16b, v2.16b \n" \ + "bif v10.16b, v0.16b, v3.16b \n" \ + "bif v11.16b, v0.16b, v4.16b \n" \ + "bif v12.16b, v0.16b, v5.16b \n" \ + "bif v13.16b, v0.16b, v6.16b \n" \ + "bif v14.16b, v0.16b, v7.16b \n" \ + "fcmge v1.4s, v15.4s, v0.4s\n" \ + "fcmge v2.4s, v16.4s, v0.4s\n" \ + "fcmge v3.4s, v17.4s, v0.4s\n" \ + "fcmge v4.4s, v18.4s, v0.4s\n" \ + "fcmge v5.4s, v19.4s, v0.4s\n" \ + "fcmge v6.4s, v20.4s, v0.4s\n" \ + "fcmge v7.4s, v21.4s, v0.4s\n" \ + "bif v15.16b, v0.16b, v1.16b \n" \ + "bif v16.16b, v0.16b, v2.16b \n" \ + "bif v17.16b, v0.16b, v3.16b \n" \ + "bif v18.16b, v0.16b, v4.16b \n" \ + "bif v19.16b, v0.16b, v5.16b \n" \ + "bif v20.16b, v0.16b, v6.16b \n" \ + "bif v21.16b, v0.16b, v7.16b \n" \ + "fcmge v1.4s, v22.4s, v0.4s\n" \ + "fcmge v2.4s, v23.4s, v0.4s\n" \ + "fcmge v3.4s, v24.4s, v0.4s\n" \ + "fcmge v4.4s, v25.4s, v0.4s\n" \ + "fcmge v5.4s, v26.4s, v0.4s\n" \ + "fcmge v6.4s, v27.4s, v0.4s\n" \ + "fcmge v7.4s, v28.4s, v0.4s\n" \ + "bif v22.16b, v0.16b, v1.16b \n" \ + "bif v23.16b, v0.16b, v2.16b \n" \ + "bif v24.16b, v0.16b, v3.16b \n" \ + "bif v25.16b, v0.16b, v4.16b \n" \ + "bif v26.16b, v0.16b, v5.16b \n" \ + "bif v27.16b, v0.16b, v6.16b \n" \ + "bif v28.16b, v0.16b, v7.16b \n" \ + "fcmge v1.4s, v29.4s, v0.4s\n" \ + "fcmge v2.4s, v30.4s, v0.4s\n" \ + "fcmge v3.4s, v31.4s, v0.4s\n" \ + "bif v29.16b, v0.16b, v1.16b \n" \ + "bif v30.16b, v0.16b, v2.16b \n" \ + "bif v31.16b, v0.16b, v3.16b \n" \ + "fcvtas v0.4s, v8.4s\n" /* cvt to int */ \ + "fcvtas v1.4s, v9.4s\n" /* cvt to int */ \ + "fcvtas v2.4s, v10.4s\n" /* cvt to int */ \ + "fcvtas v3.4s, v11.4s\n" /* cvt to int */ \ + "fcvtas v4.4s, v12.4s\n" /* cvt to int */ \ + "fcvtas v5.4s, v13.4s\n" /* cvt to int */ \ + "fcvtas v6.4s, v14.4s\n" /* cvt to int */ \ + "fcvtas v7.4s, v15.4s\n" /* cvt to int */ \ + "sqxtn v8.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v8.8h, v1.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v9.4h, v2.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v9.8h, v3.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v10.4h, v4.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v10.8h, v5.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v11.4h, v6.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v11.8h, v7.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v12.8b, 
v8.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v12.16b,v9.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v13.8b, v10.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v13.16b,v11.8h\n" /* cvt int16 to int8 */ \ + "fcvtas v0.4s, v16.4s\n" /* cvt to int */ \ + "fcvtas v1.4s, v17.4s\n" /* cvt to int */ \ + "fcvtas v2.4s, v18.4s\n" /* cvt to int */ \ + "fcvtas v3.4s, v19.4s\n" /* cvt to int */ \ + "fcvtas v4.4s, v20.4s\n" /* cvt to int */ \ + "fcvtas v5.4s, v21.4s\n" /* cvt to int */ \ + "stp q12, q13, [%[c_ptr1]]\n" \ + "fcvtas v6.4s, v22.4s\n" /* cvt to int */ \ + "fcvtas v7.4s, v23.4s\n" /* cvt to int */ \ + "fcvtas v8.4s, v24.4s\n" /* cvt to int */ \ + "fcvtas v9.4s, v25.4s\n" /* cvt to int */ \ + "fcvtas v10.4s, v26.4s\n" /* cvt to int */ \ + "fcvtas v11.4s, v27.4s\n" /* cvt to int */ \ + "fcvtas v12.4s, v28.4s\n" /* cvt to int */ \ + "fcvtas v13.4s, v29.4s\n" /* cvt to int */ \ + "fcvtas v14.4s, v30.4s\n" /* cvt to int */ \ + "fcvtas v15.4s, v31.4s\n" /* cvt to int */ \ + "sqxtn v16.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v16.8h, v1.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v17.4h, v2.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v17.8h, v3.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v18.4h, v4.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v18.8h, v5.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v19.4h, v6.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v19.8h, v7.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v20.4h, v8.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v20.8h, v9.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v21.4h, v10.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v21.8h, v11.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v22.4h, v12.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v22.8h, v13.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v23.4h, v14.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v23.8h, v15.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v24.8b, v16.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v24.16b, v17.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v25.8b, v18.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v25.16b, v19.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v26.8b, v20.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v26.16b, v21.8h\n" /* cvt int16 to int8 */ /* store result */ \ + "sqxtn v27.8b, v22.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v27.16b, v23.8h\n" /* cvt int16 to int8 */ \ + "str q24, [%[c_ptr1], #32]\n" \ + "stp q25, q26, [%[c_ptr2]]\n" \ + "str q27, [%[c_ptr2], #32]\n" + +#define SPARSE_INT8_F32_W32_V8_SEMI2_OUT \ + SPARSE_INT8_F32_W32_V8_SEMI2_KERNEL \ + SPARSE_INT8_F32_W32_V8_SEMI2_RELU \ + SPARSE_INT8_F32_W32_V8_SEMI2_RELU6 \ + SPARSE_INT8_F32_W32_V8_SEMI2_LEAKY_RELU \ + SPARSE_INT8_F32_W32_V8_SEMI2_HARD_SWISH \ + /* store result */ \ + "stp q16, q17, [%[c_ptr1]]\n" \ + "stp q24, q25, [%[c_ptr2]]\n" \ + "stp q18, q19, [%[c_ptr1], #32]\n" \ + "stp q26, q27, [%[c_ptr2], #32]\n" \ + "stp q20, q21, [%[c_ptr1], #64]\n" \ + "stp q28, q29, [%[c_ptr2], #64]\n" \ + "stp q22, q23, [%[c_ptr1], #96]\n" \ + "stp q30, q31, [%[c_ptr2], #96]\n" + +#define SPARSE_INT8_F32_W16_V8_SEMI2_OUT \ + SPARSE_INT8_F32_W16_V8_SEMI2_KERNEL \ + SPARSE_INT8_F32_W16_V8_SEMI2_RELU \ + SPARSE_INT8_F32_W16_V8_SEMI2_RELU6 \ + SPARSE_INT8_F32_W16_V8_SEMI2_LEAKY_RELU \ + SPARSE_INT8_F32_W16_V8_SEMI2_HARD_SWISH \ + /* store result */ \ + "stp q24, q25, [%[c_ptr1]]\n" \ + "stp q28, q29, [%[c_ptr2]]\n" \ + "stp q26, q27, [%[c_ptr1], #32]\n" \ + "stp q30, q31, [%[c_ptr2], #32]\n" + +#define SPARSE_INT8_F32_W8_V8_SEMI2_OUT \ + SPARSE_INT8_F32_W8_V8_SEMI2_KERNEL \ + SPARSE_INT8_F32_W8_V8_SEMI2_RELU \ + SPARSE_INT8_F32_W8_V8_SEMI2_RELU6 \ + 
SPARSE_INT8_F32_W8_V8_SEMI2_LEAKY_RELU \ + SPARSE_INT8_F32_W8_V8_SEMI2_HARD_SWISH \ + /* store result */ \ + "stp q28, q29, [%[c_ptr1]]\n" \ + "stp q30, q31, [%[c_ptr2]]\n" + +#define SPARSE_INT8_F32_W4_V8_SEMI2_OUT \ + SPARSE_INT8_F32_W4_V8_SEMI2_KERNEL \ + SPARSE_INT8_F32_W4_V8_SEMI2_RELU \ + SPARSE_INT8_F32_W4_V8_SEMI2_RELU6 \ + SPARSE_INT8_F32_W4_V8_SEMI2_LEAKY_RELU \ + SPARSE_INT8_F32_W4_V8_SEMI2_HARD_SWISH \ + /* store result */ \ + "str q30, [%[c_ptr1]]\n" \ + "str q31, [%[c_ptr2]]\n" + +#define SPARSE_INT8_F32_W32_V8_SEMI1_OUT \ + SPARSE_INT8_F32_W32_V8_SEMI1_KERNEL \ + SPARSE_INT8_F32_W32_V8_SEMI1_RELU \ + SPARSE_INT8_F32_W32_V8_SEMI1_RELU6 \ + SPARSE_INT8_F32_W32_V8_SEMI1_LEAKY_RELU \ + SPARSE_INT8_F32_W32_V8_SEMI1_HARD_SWISH \ + /* store result */ \ + "stp q21, q22, [%[c_ptr]]\n" \ + "stp q23, q24, [%[c_ptr], #32]\n" \ + "stp q25, q26, [%[c_ptr], #64]\n" \ + "stp q27, q28, [%[c_ptr], #96]\n" + +#define SPARSE_INT8_F32_W16_V8_SEMI1_OUT \ + SPARSE_INT8_F32_W16_V8_SEMI1_KERNEL \ + SPARSE_INT8_F32_W16_V8_SEMI1_RELU \ + SPARSE_INT8_F32_W16_V8_SEMI1_RELU6 \ + SPARSE_INT8_F32_W16_V8_SEMI1_LEAKY_RELU \ + SPARSE_INT8_F32_W16_V8_SEMI1_HARD_SWISH \ + /* store result */ \ + "stp q21, q22, [%[c_ptr]]\n" \ + "stp q23, q24, [%[c_ptr], #32]\n" + +#define SPARSE_INT8_F32_W8_V8_SEMI1_OUT \ + SPARSE_INT8_F32_W8_V8_SEMI1_KERNEL \ + SPARSE_INT8_F32_W8_V8_SEMI1_RELU \ + SPARSE_INT8_F32_W8_V8_SEMI1_RELU6 \ + SPARSE_INT8_F32_W8_V8_SEMI1_LEAKY_RELU \ + SPARSE_INT8_F32_W8_V8_SEMI1_HARD_SWISH \ + /* store result */ \ + "stp q21, q22, [%[c_ptr]]\n" + +#define SPARSE_INT8_F32_W4_V8_SEMI1_OUT \ + SPARSE_INT8_F32_W4_V8_SEMI1_KERNEL \ + SPARSE_INT8_F32_W4_V8_SEMI1_RELU \ + SPARSE_INT8_F32_W4_V8_SEMI1_RELU6 \ + SPARSE_INT8_F32_W4_V8_SEMI1_LEAKY_RELU \ + SPARSE_INT8_F32_W4_V8_SEMI1_HARD_SWISH \ + /* store result */ \ + "str q21, [%[c_ptr]]\n" + +#define SET_ASSM_INPUT_PARAM1_V8_INT8_F32 \ +: [a_ptr] "+r"(cur_w), \ + [b_ptr] "+r"(cur_b), \ + [c_ptr1] "+r"(out_ptr1), \ + [c_ptr2] "+r"(out_ptr2), \ + [k] "+r"(nnz), \ + [widx_dmap] "+r"(dmap), \ + [bias_ptr] "+r"(pbias), \ + [scale_ptr] "+r"(pscale) \ +: [vflag_act] "r"(flag_act), \ + [valpha] "r"(alpha), \ + [hs_param] "r"(hs_param) + +#define SET_ASSM_INPUT_PARAM2_V8_INT8_F32 \ +: [a_ptr] "+r"(cur_w), \ + [b_ptr] "+r"(cur_b), \ + [c_ptr] "+r"(out_ptr), \ + [k] "+r"(nnz), \ + [widx_dmap] "+r"(dmap) \ +: [vscale] "r"(vscale), \ + [vbias] "r"(vbias), \ + [vflag_act] "r"(flag_act), \ + [valpha] "r"(alpha), \ + [hs_param] "r"(hs_param) + +/** + * \brief Semi-structured Sparse calculation implementation of 1x1 convolution, + * the input-output + * type is int8-f32. 
+ * Semi-structured sparse matrix multiplication is calculated in blocks; the
+ * block size is Mx32, that is, the parameter matrix is MxK and the
+ * activation matrix is Kx32. When N is less than 32, it is calculated in
+ * blocks of Mx16, Mx8, and Mx4 in turn.
+ * @param A semi-structured sparse weight data
+ * @param B dense input data
+ * @param widx_dmap an array of int32_t values storing the differences,
+ * scaled by sizeof(input element), between the input channels of
+ * successive non-zero elements
+ * @param nidx_nnzmap the number of non-zero kernel elements per each output
+ * channel
+ * @param bias
+ * @param scale
+ * @param output
+ * @param M
+ * @param K
+ * @param N
+ * @param param
+ * @param ctx
+ */
+void sparse_semi_conv_int8_fp32_pipelined(
+    const int8_t* A,
+    const int8_t* B,
+    const int32_t* widx_dmap,
+    const uint32_t* nidx_nnzmap,
+    const float* bias,
+    const float* scale,
+    float* output,
+    int M,
+    int K,
+    int N,
+    const operators::SparseConvParam& param,
+    ARMContext* ctx) {
+  auto act_param = param.activation_param;
+  auto act_type = act_param.active_type;
+  volatile float alpha = 0.f;
+  float hs_param[12] = {0.f};
+  int flag_act = 0x00;  // relu: 1, relu6: 2, leaky: 3, hard_swish: 4
+  if (act_param.has_active) {
+    if (act_type == lite_api::ActivationType::kRelu) {
+      flag_act = 0x01;
+    } else if (act_type == lite_api::ActivationType::kRelu6) {
+      flag_act = 0x02;
+      alpha = act_param.Relu_clipped_coef;
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+      flag_act = 0x03;
+      alpha = act_param.Leaky_relu_alpha;
+    } else if (act_type == lite_api::ActivationType::kHardSwish) {
+      flag_act = 0x04;
+      for (int i = 0; i < 4; i++) {
+        hs_param[i] = act_param.hard_swish_offset;
+        hs_param[i + 4] = 1.0 / act_param.hard_swish_scale;
+        hs_param[i + 8] = act_param.hard_swish_threshold;
+      }
+    }
+  }
+  int flag_bias = (bias != nullptr) ?
1 : 0; + size_t mc = N * sizeof(int8_t); + size_t nc = M; + size_t output_stride = N * sizeof(float); + size_t output_decrement = output_stride * nc - 32 * sizeof(float); + size_t pair_num = M / 2; + size_t lave_num = M % 2; + float bias_zero[2] = {0.f, 0.f}; + while + SPARSE_LIKELY(mc >= 32 * sizeof(int8_t)) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_PARAM_TABLE(int8_t, float) + const float* pscale = scale + i * 2; + // clang-format off + asm volatile(SPARSE_INT8_F32_W32_V8_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_V8_INT8_F32 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31", "w1", "x1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_PARAM_TABLE(int8_t, float) + float vscale = scale[pair_num * 2]; + // clang-format off + asm volatile(SPARSE_INT8_F32_W32_V8_SEMI1_OUT + SET_ASSM_INPUT_PARAM2_V8_INT8_F32 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v30", "v31", "w0", "w1", "x1", "cc", "memory"); + // clang-format on + } + output = reinterpret_cast((uintptr_t)output + 32 * sizeof(float)); + B += 32; + mc -= 32 * sizeof(int8_t); + } + if + SPARSE_UNLIKELY(mc != 0) { + output_decrement += 16 * sizeof(float); + if (mc & (16 * sizeof(int8_t))) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_PARAM_TABLE(int8_t, float) + const float* pscale = scale + i * 2; + // clang-format off + asm volatile(SPARSE_INT8_F32_W16_V8_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_V8_INT8_F32 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28","v29", + "v30", "v31", "w1", "x1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_PARAM_TABLE(int8_t, float) + float vscale = scale[pair_num * 2]; + // clang-format off + asm volatile(SPARSE_INT8_F32_W16_V8_SEMI1_OUT + SET_ASSM_INPUT_PARAM2_V8_INT8_F32 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v11", "v12", "v13", "v14", "v16", "v17", "v21", "v22", "v23", + "v24", "w1", "x1", "cc", "memory"); + // clang-format on + } + output = + reinterpret_cast((uintptr_t)output + 16 * sizeof(float)); + B += 16; + mc -= 16 * sizeof(int8_t); + } + output_decrement += 8 * sizeof(float); + if (mc & (8 * sizeof(int8_t))) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_PARAM_TABLE(int8_t, float) + const float* pscale = scale + i * 2; + // clang-format off + asm volatile(SPARSE_INT8_F32_W8_V8_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_V8_INT8_F32 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v28", "v29", "v30", "v31", "w1", "x1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_PARAM_TABLE(int8_t, float) + float vscale = scale[pair_num * 2]; + // clang-format off + asm volatile(SPARSE_INT8_F32_W8_V8_SEMI1_OUT + SET_ASSM_INPUT_PARAM2_V8_INT8_F32 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v21", + "v22", "w1", "x1", "cc", "memory"); + // clang-format on 
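+          /* Hypothetical scalar sketch (comment only, not compiled) of what
+             one SEMI2 row pair computes, assuming the layout described in
+             the doc comment above; block_n, acc0, and acc1 are illustrative
+             names for this sketch. Each of the nnz steps reads two weights
+             (one per output row of the pair), accumulates them against the
+             current activation block, and advances the activation pointer by
+             the pre-scaled difference stored in widx_dmap:
+
+               for (uint32_t j = 0; j < nnz; j++) {
+                 int8_t w0 = cur_w[2 * j];      // weight for row 2*i
+                 int8_t w1 = cur_w[2 * j + 1];  // weight for row 2*i + 1
+                 for (int n = 0; n < block_n; n++) {
+                   acc0[n] += (int32_t)w0 * (int32_t)cur_b[n];
+                   acc1[n] += (int32_t)w1 * (int32_t)cur_b[n];
+                 }
+                 cur_b += dmap[j];  // diff already scaled by sizeof(int8_t)
+               }
+               // then out[n] = bias + scale * (float)acc[n], followed by
+               // the selected activation. */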
+ } + output = + reinterpret_cast((uintptr_t)output + 8 * sizeof(float)); + B += 8; + mc -= 8 * sizeof(int8_t); + } + output_decrement += 4 * sizeof(float); + if (mc & (4 * sizeof(int8_t))) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_PARAM_TABLE(int8_t, float) + const float* pscale = scale + i * 2; + // clang-format off + asm volatile(SPARSE_INT8_F32_W4_V8_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_V8_INT8_F32 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31", "w1", "w2", "w3", "w4", "w5", "x1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_PARAM_TABLE(int8_t, float) + float vscale = scale[pair_num * 2]; + // clang-format off + asm volatile(SPARSE_INT8_F32_W4_V8_SEMI1_OUT + SET_ASSM_INPUT_PARAM2_V8_INT8_F32 + : "v0", "v1", "v2", "v3", "v4", "v6", "v7", "v8", "v9", "v11", "v16", "v21", + "w1", "w2", "w3", "w4", "w5", "x1", "cc", "memory"); + // clang-format on + } + output = + reinterpret_cast((uintptr_t)output + 4 * sizeof(float)); + B += 4; + mc -= 4 * sizeof(int8_t); + } + + if + SPARSE_UNLIKELY(mc != 0 && mc < 4 * sizeof(int8_t)) { + int mindex = mc / sizeof(int8_t); + + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + float* out_ptr1 = reinterpret_cast((uintptr_t)output + + output_stride * 2 * i); + float* out_ptr2 = reinterpret_cast( + (uintptr_t)output + output_stride * (2 * i + 1)); + const int8_t* cur_w = A; + uint32_t nnz = nidx_nnzmap[i]; + const int8_t* cur_b = B; + const int32_t* dmap = widx_dmap; + if (i != 0) { + cur_w = A + nidx_nnzmap[i - 1] * 2; + nnz = nidx_nnzmap[i] - nidx_nnzmap[i - 1]; + cur_b += + ((nidx_nnzmap[i - 1] == 0) + ? 0 + : widx_dmap[nidx_nnzmap[i - 1] - 1] / sizeof(int8_t)); + dmap = widx_dmap + nidx_nnzmap[i - 1]; + } + float32x4_t vbias1, vbias2; + if (bias != nullptr) { + vbias1 = vdupq_n_f32(bias[i * 2]); + vbias2 = vdupq_n_f32(bias[i * 2 + 1]); + } else { + vbias1 = vdupq_n_f32(0); + vbias2 = vdupq_n_f32(0); + } + const float* pscale = scale + i * 2; + int32x4_t vacc0123_1 = vdupq_n_s32(0); + int32x4_t vacc0123_2 = vdupq_n_s32(0); + for (int j = 0; j < nnz; j++) { + int8x8_t vi0123 = vdup_n_s8(0); + if (mc == 1) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + } else if (mc == 2) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + } else { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + vi0123 = vld1_lane_s8(cur_b + 2, vi0123, 2); + } + int8x8_t vw1 = vld1_dup_s8(cur_w); + cur_w += 1; + int8x8_t vw2 = vld1_dup_s8(cur_w); + cur_w += 1; + intptr_t diff = *dmap++; + cur_b = (const int8_t*)((uintptr_t)cur_b + (uintptr_t)diff); + vacc0123_1 = + vaddw_s16(vacc0123_1, vget_low_s16(vmull_s8(vi0123, vw1))); + vacc0123_2 = + vaddw_s16(vacc0123_2, vget_low_s16(vmull_s8(vi0123, vw2))); + } + float32x4_t vaccf0123_1 = vcvtq_f32_s32(vacc0123_1); + float32x4_t vaccf0123_2 = vcvtq_f32_s32(vacc0123_2); + vaccf0123_1 = vmlaq_n_f32(vbias1, vaccf0123_1, pscale[0]); + vaccf0123_2 = vmlaq_n_f32(vbias2, vaccf0123_2, pscale[1]); + float32x4_t vzero = vdupq_n_f32(0); + if (flag_act == 1) { + vaccf0123_1 = vmaxq_f32(vaccf0123_1, vzero); + vaccf0123_2 = vmaxq_f32(vaccf0123_2, vzero); + } else if (flag_act == 0) { + } else if (flag_act == 2) { + float32x4_t aph = vdupq_n_f32(alpha); + vaccf0123_1 = vmaxq_f32(vaccf0123_1, vzero); + vaccf0123_1 = vminq_f32(vaccf0123_1, aph); + vaccf0123_2 = vmaxq_f32(vaccf0123_2, vzero); 
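+            /* Reference summary of the activation dispatch used here and in
+               the asm macros above, with x the scaled accumulator (hedged
+               restatement, not normative):
+                 flag_act == 1: y = max(x, 0)                    (relu)
+                 flag_act == 2: y = min(max(x, 0), alpha)        (relu6)
+                 flag_act == 3: y = x >= 0 ? x : alpha * x       (leaky relu)
+                 flag_act == 4: y = (x / hard_swish_scale) *
+                                    min(max(x + hard_swish_offset, 0),
+                                        hard_swish_threshold)
+               with hs_param packed as {offset x4, 1/scale x4,
+               threshold x4}. */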
+ vaccf0123_2 = vminq_f32(vaccf0123_2, aph); + } else if (flag_act == 3) { + float32x4_t aph = vdupq_n_f32(alpha); + uint32x4_t vflag0123_1 = vcgeq_f32(vaccf0123_1, vzero); + float32x4_t v0123_1 = vmulq_f32(vaccf0123_1, aph); + vaccf0123_1 = vbslq_f32(vflag0123_1, vaccf0123_1, v0123_1); + uint32x4_t vflag0123_2 = vcgeq_f32(vaccf0123_2, vzero); + float32x4_t v0123_2 = vmulq_f32(vaccf0123_2, aph); + vaccf0123_2 = vbslq_f32(vflag0123_2, vaccf0123_2, v0123_2); + } else { + float32x4_t offset = vdupq_n_f32(act_param.hard_swish_offset); + float32x4_t hs_scale = + vdupq_n_f32(1.0 / act_param.hard_swish_scale); + float32x4_t thre = vdupq_n_f32(act_param.hard_swish_threshold); + float32x4_t vset0123_1 = vaddq_f32(vaccf0123_1, offset); + float32x4_t vscale0123_1 = vmulq_f32(vaccf0123_1, hs_scale); + vset0123_1 = vmaxq_f32(vset0123_1, vzero); + vset0123_1 = vminq_f32(vset0123_1, thre); + vaccf0123_1 = vmulq_f32(vscale0123_1, vset0123_1); + float32x4_t vset0123_2 = vaddq_f32(vaccf0123_2, offset); + float32x4_t vscale0123_2 = vmulq_f32(vaccf0123_2, hs_scale); + vset0123_2 = vmaxq_f32(vset0123_2, vzero); + vset0123_2 = vminq_f32(vset0123_2, thre); + vaccf0123_2 = vmulq_f32(vscale0123_2, vset0123_2); + } + + if (mc == 1) { + vst1q_lane_f32(out_ptr1, vaccf0123_1, 0); + vst1q_lane_f32(out_ptr2, vaccf0123_2, 0); + } else if (mc == 2) { + vst1q_lane_f32(out_ptr1, vaccf0123_1, 0); + vst1q_lane_f32(out_ptr2, vaccf0123_2, 0); + vst1q_lane_f32(out_ptr1 + 1, vaccf0123_1, 1); + vst1q_lane_f32(out_ptr2 + 1, vaccf0123_2, 1); + } else { + vst1q_lane_f32(out_ptr1, vaccf0123_1, 0); + vst1q_lane_f32(out_ptr2, vaccf0123_2, 0); + vst1q_lane_f32(out_ptr1 + 1, vaccf0123_1, 1); + vst1q_lane_f32(out_ptr2 + 1, vaccf0123_2, 1); + vst1q_lane_f32(out_ptr1 + 2, vaccf0123_1, 2); + vst1q_lane_f32(out_ptr2 + 2, vaccf0123_2, 2); + } + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + float* out_ptr = reinterpret_cast( + (uintptr_t)output + output_stride * 2 * pair_num); + const int8_t* cur_w = A; + uint32_t nnz = nidx_nnzmap[pair_num]; + const int8_t* cur_b = B; + const int32_t* dmap = widx_dmap; + if (pair_num != 0) { + cur_w = A + nidx_nnzmap[pair_num - 1] * 2; + nnz = nidx_nnzmap[pair_num] - nidx_nnzmap[pair_num - 1]; + cur_b += ((nidx_nnzmap[pair_num - 1] == 0) + ? 0 + : widx_dmap[nidx_nnzmap[pair_num - 1] - 1] / + sizeof(int8_t)); + dmap = widx_dmap + nidx_nnzmap[pair_num - 1]; + } + float32x4_t vbias = (bias != nullptr) + ? 
vdupq_n_f32(bias[pair_num * 2]) + : vdupq_n_f32(0); + float vscale = scale[pair_num * 2]; + int32x4_t vacc0123 = vdupq_n_s32(0); + + for (int j = 0; j < nnz; j++) { + int8x8_t vi0123 = vdup_n_s8(0); + if (mc == 1) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + } else if (mc == 2) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + } else { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + vi0123 = vld1_lane_s8(cur_b + 2, vi0123, 2); + } + int8x8_t vw = vld1_dup_s8(cur_w); + cur_w += 1; + intptr_t diff = *dmap++; + cur_b = (const int8_t*)((uintptr_t)cur_b + (uintptr_t)diff); + int16x8_t vo0123 = vmull_s8(vi0123, vw); + vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vo0123)); + } + float32x4_t vaccf0123 = vcvtq_f32_s32(vacc0123); + vaccf0123 = vmlaq_n_f32(vbias, vaccf0123, vscale); + float32x4_t vzero = vdupq_n_f32(0); + if (flag_act == 1) { + vaccf0123 = vmaxq_f32(vaccf0123, vzero); + } else if (flag_act == 0) { + } else if (flag_act == 2) { + float32x4_t aph = vdupq_n_f32(alpha); + vaccf0123 = vmaxq_f32(vaccf0123, vzero); + vaccf0123 = vminq_f32(vaccf0123, aph); + } else if (flag_act == 3) { + float32x4_t aph = vdupq_n_f32(alpha); + uint32x4_t vflag0123 = vcgeq_f32(vaccf0123, vzero); + float32x4_t v0123 = vmulq_f32(vaccf0123, aph); + vaccf0123 = vbslq_f32(vflag0123, vaccf0123, v0123); + } else { + float32x4_t offset = vdupq_n_f32(act_param.hard_swish_offset); + float32x4_t hs_scale = + vdupq_n_f32(1.0 / act_param.hard_swish_scale); + float32x4_t thre = vdupq_n_f32(act_param.hard_swish_threshold); + float32x4_t vset0123 = vaddq_f32(vaccf0123, offset); + float32x4_t vscale0123 = vmulq_f32(vaccf0123, hs_scale); + vset0123 = vmaxq_f32(vset0123, vzero); + vset0123 = vminq_f32(vset0123, thre); + vaccf0123 = vmulq_f32(vscale0123, vset0123); + } + + if (mc == 1) { + vst1q_lane_f32(out_ptr, vaccf0123, 0); + } else if (mc == 2) { + vst1q_lane_f32(out_ptr, vaccf0123, 0); + vst1q_lane_f32(out_ptr + 1, vaccf0123, 1); + } else { + vst1q_lane_f32(out_ptr, vaccf0123, 0); + vst1q_lane_f32(out_ptr + 1, vaccf0123, 1); + vst1q_lane_f32(out_ptr + 2, vaccf0123, 2); + } + } + } + } +} + +#define SPARSE_INT8_INT8_W48_V8_SEMI_KERNEL \ + "eor v16.16b, v0.16b, v0.16b\n" \ + "eor v17.16b, v1.16b, v1.16b\n" \ + "eor v18.16b, v2.16b, v2.16b\n" \ + "eor v19.16b, v3.16b, v3.16b\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "eor v20.16b, v4.16b, v4.16b\n" \ + "eor v21.16b, v5.16b, v5.16b\n" \ + "eor v22.16b, v6.16b, v6.16b\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "eor v23.16b, v7.16b, v7.16b\n" \ + "eor v24.16b, v8.16b, v8.16b\n" \ + "eor v25.16b, v9.16b, v9.16b\n" \ + "eor v26.16b, v10.16b, v10.16b\n" \ + "prfm pldl1keep, [%[b_ptr], #128]\n" \ + "eor v27.16b, v11.16b, v11.16b\n" \ + "eor v28.16b, v12.16b, v12.16b\n" \ + "eor v29.16b, v13.16b, v13.16b\n" \ + "eor v30.16b, v14.16b, v14.16b\n" \ + "eor v31.16b, v15.16b, v15.16b\n" \ + "eor v8.16b, v0.16b, v0.16b\n" \ + "eor v9.16b, v1.16b, v1.16b\n" \ + "eor v10.16b, v2.16b, v2.16b\n" \ + "eor v11.16b, v3.16b, v3.16b\n" \ + "eor v12.16b, v4.16b, v4.16b\n" \ + "eor v13.16b, v5.16b, v5.16b\n" \ + "eor v14.16b, v6.16b, v6.16b\n" \ + "eor v15.16b, v7.16b, v7.16b\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v0.16b}, [%[a_ptr]], #1\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v2.16b, v3.16b, v4.16b}, [%[b_ptr]]\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull v5.8h, v0.8b, v2.8b\n" \ + "smull2 v6.8h, v0.16b, v2.16b\n" \ + "smull 
v7.8h, v0.8b, v3.8b\n" \ + "saddw v8.4s, v8.4s, v5.4h\n" \ + "saddw2 v9.4s, v9.4s, v5.8h\n" \ + "saddw v10.4s, v10.4s, v6.4h\n" \ + "saddw2 v11.4s, v11.4s, v6.8h\n" \ + "saddw v12.4s, v12.4s, v7.4h\n" \ + "saddw2 v13.4s, v13.4s, v7.8h\n" \ + "smull2 v5.8h, v0.16b, v3.16b\n" \ + "smull v6.8h, v0.8b, v4.8b\n" \ + "smull2 v7.8h, v0.16b, v4.16b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v14.4s, v14.4s, v5.4h\n" \ + "saddw2 v15.4s, v15.4s, v5.8h\n" \ + "saddw v16.4s, v16.4s, v6.4h\n" \ + "saddw2 v17.4s, v17.4s, v6.8h\n" \ + "saddw v18.4s, v18.4s, v7.4h\n" \ + "saddw2 v19.4s, v19.4s, v7.8h\n" \ + "smull v0.8h, v1.8b, v2.8b\n" \ + "smull2 v5.8h, v1.16b, v2.16b\n" \ + "smull v6.8h, v1.8b, v3.8b\n" \ + "smull2 v7.8h, v1.16b, v3.16b\n" \ + "saddw v20.4s, v20.4s, v0.4h\n" \ + "saddw2 v21.4s, v21.4s, v0.8h\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "saddw v22.4s, v22.4s, v5.4h\n" \ + "saddw2 v23.4s, v23.4s, v5.8h\n" \ + "saddw v24.4s, v24.4s, v6.4h\n" \ + "saddw2 v25.4s, v25.4s, v6.8h\n" \ + "saddw v26.4s, v26.4s, v7.4h\n" \ + "saddw2 v27.4s, v27.4s, v7.8h\n" \ + "smull v0.8h, v1.8b, v4.8b\n" \ + "smull2 v5.8h, v1.16b, v4.16b\n" \ + "saddw v28.4s, v28.4s, v0.4h\n" \ + "saddw2 v29.4s, v29.4s, v0.8h\n" \ + "saddw v30.4s, v30.4s, v5.4h\n" \ + "saddw2 v31.4s, v31.4s, v5.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "ld1 {v1.2s}, [%[scale_ptr]]\n" \ + "scvtf v2.4s, v8.4s\n" \ + "scvtf v3.4s, v9.4s\n" \ + "scvtf v4.4s, v10.4s\n" \ + "scvtf v5.4s, v11.4s\n" \ + "scvtf v6.4s, v12.4s\n" \ + "scvtf v7.4s, v13.4s\n" \ + "dup v8.4s, v0.s[0]\n" \ + "dup v9.4s, v0.s[0]\n" \ + "dup v10.4s, v0.s[0]\n" \ + "dup v11.4s, v0.s[0]\n" \ + "dup v12.4s, v0.s[0]\n" \ + "dup v13.4s, v0.s[0]\n" \ + "fmla v8.4s, v2.4s, v1.s[0]\n" \ + "fmla v9.4s, v3.4s, v1.s[0]\n" \ + "fmla v10.4s, v4.4s, v1.s[0]\n" \ + "fmla v11.4s, v5.4s, v1.s[0]\n" \ + "fmla v12.4s, v6.4s, v1.s[0]\n" \ + "fmla v13.4s, v7.4s, v1.s[0]\n" \ + "scvtf v2.4s, v14.4s\n" \ + "scvtf v3.4s, v15.4s\n" \ + "scvtf v4.4s, v16.4s\n" \ + "scvtf v5.4s, v17.4s\n" \ + "scvtf v6.4s, v18.4s\n" \ + "scvtf v7.4s, v19.4s\n" \ + "dup v14.4s, v0.s[0]\n" \ + "dup v15.4s, v0.s[0]\n" \ + "dup v16.4s, v0.s[0]\n" \ + "dup v17.4s, v0.s[0]\n" \ + "dup v18.4s, v0.s[0]\n" \ + "dup v19.4s, v0.s[0]\n" \ + "fmla v14.4s, v2.4s, v1.s[0]\n" \ + "fmla v15.4s, v3.4s, v1.s[0]\n" \ + "fmla v16.4s, v4.4s, v1.s[0]\n" \ + "fmla v17.4s, v5.4s, v1.s[0]\n" \ + "fmla v18.4s, v6.4s, v1.s[0]\n" \ + "fmla v19.4s, v7.4s, v1.s[0]\n" \ + "scvtf v2.4s, v20.4s\n" \ + "scvtf v3.4s, v21.4s\n" \ + "scvtf v4.4s, v22.4s\n" \ + "scvtf v5.4s, v23.4s\n" \ + "scvtf v6.4s, v24.4s\n" \ + "scvtf v7.4s, v25.4s\n" \ + "dup v20.4s, v0.s[1]\n" \ + "dup v21.4s, v0.s[1]\n" \ + "dup v22.4s, v0.s[1]\n" \ + "dup v23.4s, v0.s[1]\n" \ + "dup v24.4s, v0.s[1]\n" \ + "dup v25.4s, v0.s[1]\n" \ + "fmla v20.4s, v2.4s, v1.s[1]\n" \ + "fmla v21.4s, v3.4s, v1.s[1]\n" \ + "fmla v22.4s, v4.4s, v1.s[1]\n" \ + "fmla v23.4s, v5.4s, v1.s[1]\n" \ + "fmla v24.4s, v6.4s, v1.s[1]\n" \ + "fmla v25.4s, v7.4s, v1.s[1]\n" \ + "scvtf v2.4s, v26.4s\n" \ + "scvtf v3.4s, v27.4s\n" \ + "scvtf v4.4s, v28.4s\n" \ + "scvtf v5.4s, v29.4s\n" \ + "scvtf v6.4s, v30.4s\n" \ + "scvtf v7.4s, v31.4s\n" \ + "dup v26.4s, v0.s[1]\n" \ + "dup v27.4s, v0.s[1]\n" \ + "dup v28.4s, v0.s[1]\n" \ + "dup v29.4s, v0.s[1]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "fmla v26.4s, v2.4s, v1.s[1]\n" \ + "fmla v27.4s, v3.4s, v1.s[1]\n" \ + "fmla v28.4s, v4.4s, v1.s[1]\n" \ + "fmla v29.4s, v5.4s, v1.s[1]\n" \ + "fmla v30.4s, v6.4s, v1.s[1]\n" 
\ + "fmla v31.4s, v7.4s, v1.s[1]\n" + +#define SPARSE_INT8_INT8_W32_V8_SEMI2_KERNEL \ + "eor v16.16b, v0.16b, v0.16b\n" \ + "eor v17.16b, v1.16b, v1.16b\n" \ + "eor v18.16b, v2.16b, v2.16b\n" \ + "eor v19.16b, v3.16b, v3.16b\n" \ + "prfm pldl1keep, [%[a_ptr], #128]\n" \ + "eor v20.16b, v4.16b, v4.16b\n" \ + "eor v21.16b, v5.16b, v5.16b\n" \ + "eor v22.16b, v6.16b, v6.16b\n" \ + "prfm pldl1keep, [%[widx_dmap], #128]\n" \ + "eor v23.16b, v7.16b, v7.16b\n" \ + "eor v24.16b, v8.16b, v8.16b\n" \ + "eor v25.16b, v9.16b, v9.16b\n" \ + "eor v26.16b, v10.16b, v10.16b\n" \ + "prfm pldl1keep, [%[b_ptr], #128]\n" \ + "eor v27.16b, v11.16b, v11.16b\n" \ + "eor v28.16b, v12.16b, v12.16b\n" \ + "eor v29.16b, v13.16b, v13.16b\n" \ + "eor v30.16b, v14.16b, v14.16b\n" \ + "eor v31.16b, v15.16b, v15.16b\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ld1r {v2.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v3.16b, v4.16b}, [%[b_ptr]]\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull v5.8h, v1.8b, v3.8b\n" \ + "smull2 v6.8h, v1.16b, v3.16b\n" \ + "smull v7.8h, v1.8b, v4.8b\n" \ + "smull2 v8.8h, v1.16b, v4.16b\n" \ + "smull v9.8h, v2.8b, v3.8b\n" \ + "smull2 v10.8h, v2.16b, v3.16b\n" \ + "smull v11.8h, v2.8b, v4.8b\n" \ + "smull2 v12.8h, v2.16b, v4.16b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v16.4s, v16.4s, v5.4h\n" \ + "saddw2 v17.4s, v17.4s, v5.8h\n" \ + "saddw v18.4s, v18.4s, v6.4h\n" \ + "saddw2 v19.4s, v19.4s, v6.8h\n" \ + "saddw v20.4s, v20.4s, v7.4h\n" \ + "saddw2 v21.4s, v21.4s, v7.8h\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "saddw v22.4s, v22.4s, v8.4h\n" \ + "saddw2 v23.4s, v23.4s, v8.8h\n" \ + "saddw v24.4s, v24.4s, v9.4h\n" \ + "saddw2 v25.4s, v25.4s, v9.8h\n" \ + "saddw v26.4s, v26.4s, v10.4h\n" \ + "saddw2 v27.4s, v27.4s, v10.8h\n" \ + "saddw v28.4s, v28.4s, v11.4h\n" \ + "saddw2 v29.4s, v29.4s, v11.8h\n" \ + "saddw v30.4s, v30.4s, v12.4h\n" \ + "saddw2 v31.4s, v31.4s, v12.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v0.4s, v16.4s\n" \ + "scvtf v1.4s, v17.4s\n" \ + "scvtf v2.4s, v18.4s\n" \ + "scvtf v3.4s, v19.4s\n" \ + "scvtf v4.4s, v20.4s\n" \ + "scvtf v5.4s, v21.4s\n" \ + "scvtf v6.4s, v22.4s\n" \ + "scvtf v7.4s, v23.4s\n" \ + "scvtf v8.4s, v24.4s\n" \ + "scvtf v9.4s, v25.4s\n" \ + "scvtf v10.4s, v26.4s\n" \ + "scvtf v11.4s, v27.4s\n" \ + "scvtf v12.4s, v28.4s\n" \ + "scvtf v13.4s, v29.4s\n" \ + "scvtf v14.4s, v30.4s\n" \ + "scvtf v15.4s, v31.4s\n" \ + "ld1 {v31.2s}, [%[bias_ptr]]\n" \ + "ld1 {v30.2s}, [%[scale_ptr]]\n" \ + "dup v16.4s, v31.s[0]\n" \ + "dup v17.4s, v31.s[0]\n" \ + "dup v18.4s, v31.s[0]\n" \ + "dup v19.4s, v31.s[0]\n" \ + "dup v20.4s, v31.s[0]\n" \ + "dup v21.4s, v31.s[0]\n" \ + "dup v22.4s, v31.s[0]\n" \ + "dup v23.4s, v31.s[0]\n" \ + "dup v24.4s, v31.s[1]\n" \ + "dup v25.4s, v31.s[1]\n" \ + "dup v26.4s, v31.s[1]\n" \ + "dup v27.4s, v31.s[1]\n" \ + "fmla v16.4s, v0.4s, v30.s[0]\n" \ + "fmla v17.4s, v1.4s, v30.s[0]\n" \ + "fmla v18.4s, v2.4s, v30.s[0]\n" \ + "fmla v19.4s, v3.4s, v30.s[0]\n" \ + "fmla v20.4s, v4.4s, v30.s[0]\n" \ + "fmla v21.4s, v5.4s, v30.s[0]\n" \ + "fmla v22.4s, v6.4s, v30.s[0]\n" \ + "fmla v23.4s, v7.4s, v30.s[0]\n" \ + "dup v0.4s, v30.s[1]\n" \ + "dup v28.4s, v24.s[0]\n" \ + "dup v29.4s, v24.s[0]\n" \ + "dup v30.4s, v24.s[0]\n" \ + "dup v31.4s, v24.s[0]\n" \ + "fmla v24.4s, v8.4s, v0.s[0]\n" \ + "fmla v25.4s, v9.4s, v0.s[0]\n" \ + "fmla v26.4s, v10.4s, v0.s[0]\n" \ + "fmla v27.4s, v11.4s, v0.s[0]\n" \ + "fmla v28.4s, v12.4s, v0.s[0]\n" \ + "fmla v29.4s, v13.4s, v0.s[0]\n" \ + 
"fmla v30.4s, v14.4s, v0.s[0]\n" \ + "fmla v31.4s, v15.4s, v0.s[0]\n" + +#define SPARSE_INT8_INT8_W16_V8_SEMI2_KERNEL \ + "eor v8.16b, v1.16b, v1.16b\n" \ + "eor v9.16b, v2.16b, v2.16b\n" \ + "eor v10.16b, v3.16b, v3.16b\n" \ + "eor v11.16b, v4.16b, v4.16b\n" \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "eor v12.16b, v5.16b, v5.16b\n" \ + "eor v13.16b, v6.16b, v6.16b\n" \ + "eor v14.16b, v7.16b, v7.16b\n" \ + "eor v15.16b, v8.16b, v8.16b\n" \ + "dup v24.4s, v0.s[0]\n" \ + "dup v25.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[a_ptr], #64]\n" \ + "dup v26.4s, v0.s[0]\n" \ + "dup v27.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[widx_dmap], #64]\n" \ + "dup v28.4s, v0.s[1]\n" \ + "dup v29.4s, v0.s[1]\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "ld1 {v0.2s}, [%[scale_ptr]]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "ld1r {v2.16b}, [%[a_ptr]], #1\n" \ + "sxtw x1, w1\n" \ + "ld1 {v3.16b}, [%[b_ptr]]\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull v4.8h, v1.8b, v3.8b\n" \ + "smull2 v5.8h, v1.16b, v3.16b\n" \ + "smull v6.8h, v2.8b, v3.8b\n" \ + "smull2 v7.8h, v2.16b, v3.16b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v8.4s, v8.4s, v4.4h\n" \ + "saddw2 v9.4s, v9.4s, v4.8h\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "saddw v10.4s, v10.4s, v5.4h\n" \ + "saddw2 v11.4s, v11.4s, v5.8h\n" \ + "saddw v12.4s, v12.4s, v6.4h\n" \ + "prfm pldl1keep, [%[a_ptr], #64]\n" \ + "saddw2 v13.4s, v13.4s, v6.8h\n" \ + "saddw v14.4s, v14.4s, v7.4h\n" \ + "saddw2 v15.4s, v15.4s, v7.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v1.4s, v8.4s\n" \ + "scvtf v2.4s, v9.4s\n" \ + "scvtf v3.4s, v10.4s\n" \ + "scvtf v4.4s, v11.4s\n" \ + "scvtf v5.4s, v12.4s\n" \ + "scvtf v6.4s, v13.4s\n" \ + "scvtf v7.4s, v14.4s\n" \ + "scvtf v8.4s, v15.4s\n" \ + "fmla v24.4s, v1.4s, v0.s[0]\n" \ + "fmla v25.4s, v2.4s, v0.s[0]\n" \ + "fmla v26.4s, v3.4s, v0.s[0]\n" \ + "fmla v27.4s, v4.4s, v0.s[0]\n" \ + "fmla v28.4s, v5.4s, v0.s[1]\n" \ + "fmla v29.4s, v6.4s, v0.s[1]\n" \ + "fmla v30.4s, v7.4s, v0.s[1]\n" \ + "fmla v31.4s, v8.4s, v0.s[1]\n" + +#define SPARSE_INT8_INT8_W8_V8_SEMI2_KERNEL \ + "eor v8.16b, v1.16b, v1.16b\n" \ + "eor v9.16b, v2.16b, v2.16b\n" \ + "eor v10.16b, v3.16b, v3.16b\n" \ + "eor v11.16b, v4.16b, v4.16b\n" \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "dup v28.4s, v0.s[0]\n" \ + "dup v29.4s, v0.s[0]\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "dup v30.4s, v0.s[1]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "ld1 {v0.2s}, [%[scale_ptr]]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "ld1r {v2.16b}, [%[a_ptr]], #1\n" \ + "sxtw x1, w1\n" \ + "ld1 {v3.8b}, [%[b_ptr]]\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull v4.8h, v1.8b, v3.8b\n" \ + "smull v5.8h, v2.8b, v3.8b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v8.4s, v8.4s, v4.4h\n" \ + "saddw2 v9.4s, v9.4s, v4.8h\n" \ + "prfm pldl1keep, [%[b_ptr], #64]\n" \ + "saddw v10.4s, v10.4s, v5.4h\n" \ + "saddw2 v11.4s, v11.4s, v5.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v1.4s, v8.4s\n" \ + "scvtf v2.4s, v9.4s\n" \ + "scvtf v3.4s, v10.4s\n" \ + "scvtf v4.4s, v11.4s\n" \ + "fmla v28.4s, v1.4s, v0.s[0]\n" \ + "fmla v29.4s, v2.4s, v0.s[0]\n" \ + "fmla v30.4s, v3.4s, v0.s[1]\n" \ + "fmla v31.4s, v4.4s, v0.s[1]\n" + +#define SPARSE_INT8_INT8_W4_V8_SEMI2_KERNEL \ + "eor v8.16b, v1.16b, v1.16b\n" \ + "eor v9.16b, v2.16b, v2.16b\n" \ + "ld1 {v0.2s}, [%[bias_ptr]]\n" \ + "dup v30.4s, v0.s[0]\n" \ + "dup v31.4s, v0.s[1]\n" \ + "ld1 
{v0.2s}, [%[scale_ptr]]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v1.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "ld1r {v2.16b}, [%[a_ptr]], #1\n" \ + "ldrsb w2, [%[b_ptr]]\n" \ + "ldrsb w3, [%[b_ptr], #1]\n" \ + "subs %w[k], %w[k], #1\n" \ + "ldrsb w4, [%[b_ptr], #2]\n" \ + "ldrsb w5, [%[b_ptr], #3]\n" \ + "sxtw x1, w1\n" \ + "mov v3.b[0], w2\n" \ + "mov v3.b[1], w3\n" \ + "mov v3.b[2], w4\n" \ + "mov v3.b[3], w5\n" \ + "smull v4.8h, v1.8b, v3.8b\n" \ + "smull v5.8h, v2.8b, v3.8b\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "saddw v8.4s, v8.4s, v4.4h\n" \ + "saddw v9.4s, v9.4s, v5.4h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v1.4s, v8.4s\n" \ + "scvtf v2.4s, v9.4s\n" \ + "fmla v30.4s, v1.4s, v0.s[0]\n" \ + "fmla v31.4s, v2.4s, v0.s[1]\n" + +#define SPARSE_INT8_INT8_W32_V8_SEMI1_KERNEL \ + "eor v11.16b, v0.16b, v0.16b\n" \ + "eor v12.16b, v1.16b, v1.16b\n" \ + "prfm pldl1keep, [%[a_ptr], #32]\n" \ + "eor v13.16b, v2.16b, v2.16b\n" \ + "eor v14.16b, v3.16b, v3.16b\n" \ + "prfm pldl1keep, [%[widx_dmap], #32]\n" \ + "eor v15.16b, v4.16b, v4.16b\n" \ + "eor v16.16b, v5.16b, v5.16b\n" \ + "prfm pldl1keep, [%[b_ptr], #32]\n" \ + "eor v17.16b, v6.16b, v6.16b\n" \ + "eor v18.16b, v7.16b, v7.16b\n" \ + "dup v21.4s, %w[vbias]\n" \ + "dup v22.4s, v21.s[0]\n" \ + "dup v23.4s, v21.s[0]\n" \ + "dup v24.4s, v21.s[0]\n" \ + "dup v25.4s, v21.s[0]\n" \ + "dup v26.4s, v21.s[0]\n" \ + "dup v27.4s, v21.s[0]\n" \ + "dup v28.4s, v21.s[0]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v0.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v1.16b, v2.16b}, [%[b_ptr]]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "smull v3.8h, v0.8b, v1.8b\n" \ + "smull v5.8h, v0.8b, v2.8b\n" \ + "subs %w[k], %w[k], #1\n" \ + "smull2 v4.8h, v0.16b, v1.16b\n" \ + "smull2 v6.8h, v0.16b, v2.16b\n" \ + "saddw v11.4s, v11.4s, v3.4h\n" \ + "saddw v13.4s, v13.4s, v4.4h\n" \ + "prfm pldl1keep, [%[b_ptr], #32]\n" \ + "saddw v15.4s, v15.4s, v5.4h\n" \ + "saddw v17.4s, v17.4s, v6.4h\n" \ + "saddw2 v12.4s, v12.4s, v3.8h\n" \ + "saddw2 v14.4s, v14.4s, v4.8h\n" \ + "saddw2 v16.4s, v16.4s, v5.8h\n" \ + "saddw2 v18.4s, v18.4s, v6.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v3.4s, v11.4s\n" \ + "scvtf v4.4s, v12.4s\n" \ + "scvtf v5.4s, v13.4s\n" \ + "scvtf v6.4s, v14.4s\n" \ + "scvtf v7.4s, v15.4s\n" \ + "scvtf v8.4s, v16.4s\n" \ + "scvtf v9.4s, v17.4s\n" \ + "scvtf v10.4s, v18.4s\n" /* scale */ \ + "dup v31.4s, %w[vscale]\n" \ + "fmla v21.4s, v3.4s, v31.s[0]\n" \ + "fmla v22.4s, v4.4s, v31.s[0]\n" \ + "fmla v23.4s, v5.4s, v31.s[0]\n" \ + "fmla v24.4s, v6.4s, v31.s[0]\n" \ + "fmla v25.4s, v7.4s, v31.s[0]\n" \ + "fmla v26.4s, v8.4s, v31.s[0]\n" \ + "fmla v27.4s, v9.4s, v31.s[0]\n" \ + "fmla v28.4s, v10.4s, v31.s[0]\n" + +#define SPARSE_INT8_INT8_W16_V8_SEMI1_KERNEL \ + "eor v11.16b, v0.16b, v0.16b\n" \ + "eor v12.16b, v1.16b, v1.16b\n" \ + "eor v13.16b, v2.16b, v2.16b\n" \ + "prfm pldl1keep, [%[b_ptr], #16]\n" \ + "eor v14.16b, v3.16b, v3.16b\n" \ + "dup v21.4s, %w[vbias]\n" \ + "dup v22.4s, v21.s[0]\n" \ + "dup v23.4s, v21.s[0]\n" \ + "dup v24.4s, v21.s[0]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v0.16b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v1.16b}, [%[b_ptr]]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "smull v3.8h, v0.8b, v1.8b\n" \ + "smull2 v4.8h, v0.16b, v1.16b\n" \ + "prfm pldl1keep, [%[b_ptr], #16]\n" \ + "subs %w[k], %w[k], #1\n" \ + "saddw v11.4s, v11.4s, v3.4h\n" \ + "saddw2 v12.4s, v12.4s, v3.8h\n" \ + "saddw v13.4s, v13.4s, v4.4h\n" \ + 
"saddw2 v14.4s, v14.4s, v4.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v5.4s, v11.4s\n" \ + "scvtf v6.4s, v12.4s\n" \ + "scvtf v7.4s, v13.4s\n" \ + "scvtf v8.4s, v14.4s\n" /* scale */ \ + "dup v2.4s, %w[vscale]\n" \ + "fmla v21.4s, v5.4s, v2.s[0]\n" \ + "fmla v22.4s, v6.4s, v2.s[0]\n" \ + "fmla v23.4s, v7.4s, v2.s[0]\n" \ + "fmla v24.4s, v8.4s, v2.s[0]\n" + +#define SPARSE_INT8_INT8_W8_V8_SEMI1_KERNEL \ + "eor v11.16b, v0.16b, v0.16b\n" \ + "eor v12.16b, v1.16b, v1.16b\n" \ + "dup v21.4s, %w[vbias]\n" \ + "dup v22.4s, v21.s[0]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v0.8b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "sxtw x1, w1\n" \ + "ld1 {v1.8b}, [%[b_ptr]]\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "smull v3.8h, v0.8b, v1.8b\n" \ + "subs %w[k], %w[k], #1\n" \ + "saddw v11.4s, v11.4s, v3.4h\n" \ + "saddw2 v12.4s, v12.4s, v3.8h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v4.4s, v11.4s\n" \ + "scvtf v5.4s, v12.4s\n" /* scale */ \ + "dup v2.4s, %w[vscale]\n" \ + "fmla v21.4s, v4.4s, v2.s[0]\n" \ + "fmla v22.4s, v5.4s, v2.s[0]\n" + +#define SPARSE_INT8_INT8_W4_V8_SEMI1_KERNEL \ + "eor v11.16b, v0.16b, v0.16b\n" \ + "dup v21.4s, %w[vbias]\n" \ + "cbz %w[k], 1f\n" \ + "0:\n" \ + "ld1r {v0.8b}, [%[a_ptr]], #1\n" \ + "ldr w1, [%[widx_dmap]], #4\n" \ + "ldrsb w2, [%[b_ptr]]\n" \ + "ldrsb w3, [%[b_ptr], #1]\n" \ + "ldrsb w4, [%[b_ptr], #2]\n" \ + "ldrsb w5, [%[b_ptr], #3]\n" \ + "sxtw x1, w1\n" \ + "mov v1.b[0], w2\n" \ + "mov v1.b[1], w3\n" \ + "mov v1.b[2], w4\n" \ + "mov v1.b[3], w5\n" \ + "add %[b_ptr], %[b_ptr], x1\n" \ + "smull v3.8h, v0.8b, v1.8b\n" \ + "subs %w[k], %w[k], #1\n" \ + "saddw v11.4s, v11.4s, v3.4h\n" \ + "bne 0b\n" \ + "1:\n" \ + "scvtf v4.4s, v11.4s\n" /* scale */ \ + "dup v2.4s, %w[vscale]\n" \ + "fmla v21.4s, v4.4s, v2.s[0]\n" + +#define SPARSE_INT8_INT8_W48_V8_SEMI_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v8.4s, v8.4s, v0.4s\n" /* relu */ \ + "fmax v9.4s, v9.4s, v0.4s\n" /* relu */ \ + "fmax v10.4s, v10.4s, v0.4s\n" /* relu */ \ + "fmax v11.4s, v11.4s, v0.4s\n" /* relu */ \ + "fmax v12.4s, v12.4s, v0.4s\n" /* relu */ \ + "fmax v13.4s, v13.4s, v0.4s\n" /* relu */ \ + "fmax v14.4s, v14.4s, v0.4s\n" /* relu */ \ + "fmax v15.4s, v15.4s, v0.4s\n" /* relu */ \ + "fmax v16.4s, v16.4s, v0.4s\n" /* relu */ \ + "fmax v17.4s, v17.4s, v0.4s\n" /* relu */ \ + "fmax v18.4s, v18.4s, v0.4s\n" /* relu */ \ + "fmax v19.4s, v19.4s, v0.4s\n" /* relu */ \ + "fmax v20.4s, v20.4s, v0.4s\n" /* relu */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W32_V8_SEMI2_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v16.4s, v16.4s, v0.4s\n" /* relu */ \ + "fmax v17.4s, v17.4s, v0.4s\n" /* relu */ \ + "fmax v18.4s, v18.4s, 
v0.4s\n" /* relu */ \ + "fmax v19.4s, v19.4s, v0.4s\n" /* relu */ \ + "fmax v20.4s, v20.4s, v0.4s\n" /* relu */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W16_V8_SEMI2_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W8_V8_SEMI2_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W4_V8_SEMI2_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W32_V8_SEMI1_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v30.4s, #0\n" /* for relu */ \ + "fmax v21.4s, v21.4s, v30.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v30.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v30.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v30.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v30.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v30.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v30.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v30.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W16_V8_SEMI1_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v9.4s, #0\n" /* for relu */ \ + "fmax v21.4s, v21.4s, v9.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v9.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v9.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v9.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W8_V8_SEMI1_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v9.4s, #0\n" /* for relu */ 
\ + "fmax v21.4s, v21.4s, v9.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v9.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W4_V8_SEMI1_RELU \ + /* do relu */ \ + "cmp %w[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %w[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "movi v9.4s, #0\n" /* for relu */ \ + "fmax v21.4s, v21.4s, v9.4s\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W48_V8_SEMI_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v8.4s, v8.4s, v0.4s\n" /* relu */ \ + "fmax v9.4s, v9.4s, v0.4s\n" /* relu */ \ + "fmax v10.4s, v10.4s, v0.4s\n" /* relu */ \ + "fmax v11.4s, v11.4s, v0.4s\n" /* relu */ \ + "fmax v12.4s, v12.4s, v0.4s\n" /* relu */ \ + "fmax v13.4s, v13.4s, v0.4s\n" /* relu */ \ + "fmax v14.4s, v14.4s, v0.4s\n" /* relu */ \ + "fmax v15.4s, v15.4s, v0.4s\n" /* relu */ \ + "fmax v16.4s, v16.4s, v0.4s\n" /* relu */ \ + "fmax v17.4s, v17.4s, v0.4s\n" /* relu */ \ + "fmax v18.4s, v18.4s, v0.4s\n" /* relu */ \ + "fmax v19.4s, v19.4s, v0.4s\n" /* relu */ \ + "fmax v20.4s, v20.4s, v0.4s\n" /* relu */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v8.4s, v8.4s, v1.4s\n" /* relu */ \ + "fmin v9.4s, v9.4s, v1.4s\n" /* relu */ \ + "fmin v10.4s, v10.4s, v1.4s\n" /* relu */ \ + "fmin v11.4s, v11.4s, v1.4s\n" /* relu */ \ + "fmin v12.4s, v12.4s, v1.4s\n" /* relu */ \ + "fmin v13.4s, v13.4s, v1.4s\n" /* relu */ \ + "fmin v14.4s, v14.4s, v1.4s\n" /* relu */ \ + "fmin v15.4s, v15.4s, v1.4s\n" /* relu */ \ + "fmin v16.4s, v16.4s, v1.4s\n" /* relu */ \ + "fmin v17.4s, v17.4s, v1.4s\n" /* relu */ \ + "fmin v18.4s, v18.4s, v1.4s\n" /* relu */ \ + "fmin v19.4s, v19.4s, v1.4s\n" /* relu */ \ + "fmin v20.4s, v20.4s, v1.4s\n" /* relu6 */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "fmin v23.4s, v23.4s, v1.4s\n" /* relu6 */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "fmin v25.4s, v25.4s, v1.4s\n" /* relu6 */ \ + "fmin v26.4s, v26.4s, v1.4s\n" /* relu6 */ \ + "fmin v27.4s, v27.4s, v1.4s\n" /* relu6 */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "fmin v29.4s, v29.4s, v1.4s\n" /* relu6 */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W32_V8_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v16.4s, v16.4s, v0.4s\n" /* relu */ \ + "fmax v17.4s, v17.4s, v0.4s\n" /* relu */ \ + "fmax v18.4s, v18.4s, v0.4s\n" /* relu */ \ + "fmax v19.4s, v19.4s, v0.4s\n" /* relu */ \ + "fmax v20.4s, v20.4s, v0.4s\n" /* relu */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* 
relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v16.4s, v16.4s, v1.4s\n" /* relu */ \ + "fmin v17.4s, v17.4s, v1.4s\n" /* relu */ \ + "fmin v18.4s, v18.4s, v1.4s\n" /* relu */ \ + "fmin v19.4s, v19.4s, v1.4s\n" /* relu */ \ + "fmin v20.4s, v20.4s, v1.4s\n" /* relu6 */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "fmin v23.4s, v23.4s, v1.4s\n" /* relu6 */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "fmin v25.4s, v25.4s, v1.4s\n" /* relu6 */ \ + "fmin v26.4s, v26.4s, v1.4s\n" /* relu6 */ \ + "fmin v27.4s, v27.4s, v1.4s\n" /* relu6 */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "fmin v29.4s, v29.4s, v1.4s\n" /* relu6 */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W16_V8_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "fmin v25.4s, v25.4s, v1.4s\n" /* relu6 */ \ + "fmin v26.4s, v26.4s, v1.4s\n" /* relu6 */ \ + "fmin v27.4s, v27.4s, v1.4s\n" /* relu6 */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "fmin v29.4s, v29.4s, v1.4s\n" /* relu6 */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W8_V8_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmax v29.4s, v29.4s, v0.4s\n" /* relu */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "fmin v29.4s, v29.4s, v1.4s\n" /* relu6 */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W4_V8_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v30.4s, v30.4s, v0.4s\n" /* relu */ \ + "fmax v31.4s, v31.4s, v0.4s\n" /* relu */ \ + "fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \ + "fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W32_V8_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v21.4s, 
v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmax v25.4s, v25.4s, v0.4s\n" /* relu */ \ + "fmax v26.4s, v26.4s, v0.4s\n" /* relu */ \ + "fmax v27.4s, v27.4s, v0.4s\n" /* relu */ \ + "fmax v28.4s, v28.4s, v0.4s\n" /* relu */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "fmin v23.4s, v23.4s, v1.4s\n" /* relu6 */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "fmin v25.4s, v25.4s, v1.4s\n" /* relu6 */ \ + "fmin v26.4s, v26.4s, v1.4s\n" /* relu6 */ \ + "fmin v27.4s, v27.4s, v1.4s\n" /* relu6 */ \ + "fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W16_V8_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmax v23.4s, v23.4s, v0.4s\n" /* relu */ \ + "fmax v24.4s, v24.4s, v0.4s\n" /* relu */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "fmin v23.4s, v23.4s, v1.4s\n" /* relu6 */ \ + "fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W8_V8_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmax v22.4s, v22.4s, v0.4s\n" /* relu */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W4_V8_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %w[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* relu6 alpha */ \ + "fmax v21.4s, v21.4s, v0.4s\n" /* relu */ \ + "fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W48_V8_SEMI_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v12.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v12.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v13.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v13.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v11.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v12.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v13.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v14.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v14.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v15.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v16.4s, 
v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v16.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v14.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v15.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v16.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v18.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v17.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v18.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v19.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v24.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v25.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v26.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v27.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v28.16b, v7.16b, v6.16b \n" /* choose*/ \ + "fcmge v2.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v29.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v30.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v31.16b, v7.16b, v6.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W32_V8_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v16.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v18.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v16.16b, v3.16b, v2.16b \n" /* 
choose*/ \ + "bif v17.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v18.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v19.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v20.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v21.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v22.16b, v15.16b, v14.16b \n" /* choose*/ \ + "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v24.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v25.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v26.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v27.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v28.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v29.16b, v15.16b, v14.16b \n" /* choose*/ \ + "fcmge v2.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v30.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v31.16b, v5.16b, v4.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W32_V8_SEMI2_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v16.4s, v1.4s \n" \ + "fadd v6.4s, v17.4s, v1.4s \n" \ + "fadd v8.4s, v18.4s, v1.4s \n" \ + "fadd v10.4s, v19.4s, v1.4s \n" \ + "fadd v12.4s, v20.4s, v1.4s \n" \ + "fadd v14.4s, v21.4s, v1.4s \n" \ + "fmul v5.4s, v16.4s, v2.4s \n" \ + "fmul v7.4s, v17.4s, v2.4s \n" \ + "fmul v9.4s, v18.4s, v2.4s \n" \ + "fmul v11.4s, v19.4s, v2.4s \n" \ + "fmul v13.4s, v20.4s, v2.4s \n" \ + "fmul v15.4s, v21.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmul v16.4s, v5.4s, v4.4s \n" \ + "fmul v17.4s, v7.4s, v6.4s \n" \ + "fmul v18.4s, v9.4s, v8.4s \n" \ + "fmul v19.4s, v11.4s, v10.4s \n" \ + "fmul v20.4s, v13.4s, v12.4s \n" \ + "fmul v21.4s, v15.4s, v14.4s \n" \ + "fadd v4.4s, v22.4s, v1.4s \n" \ + "fadd v6.4s, v23.4s, v1.4s \n" \ + "fadd v8.4s, v24.4s, v1.4s \n" \ + "fadd v10.4s, v25.4s, v1.4s \n" \ + "fadd v12.4s, v26.4s, v1.4s \n" \ + "fadd v14.4s, v27.4s, v1.4s \n" \ + "fmul v5.4s, v22.4s, v2.4s \n" \ + "fmul v7.4s, v23.4s, v2.4s \n" \ + "fmul v9.4s, v24.4s, v2.4s \n" \ + "fmul v11.4s, v25.4s, v2.4s \n" \ + "fmul v13.4s, v26.4s, v2.4s \n" \ + "fmul v15.4s, v27.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + 
"fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmul v22.4s, v5.4s, v4.4s \n" \ + "fmul v23.4s, v7.4s, v6.4s \n" \ + "fmul v24.4s, v9.4s, v8.4s \n" \ + "fmul v25.4s, v11.4s, v10.4s \n" \ + "fmul v26.4s, v13.4s, v12.4s \n" \ + "fmul v27.4s, v15.4s, v14.4s \n" \ + "fadd v4.4s, v28.4s, v1.4s \n" \ + "fadd v6.4s, v29.4s, v1.4s \n" \ + "fadd v8.4s, v30.4s, v1.4s \n" \ + "fadd v10.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v28.4s, v2.4s \n" \ + "fmul v7.4s, v29.4s, v2.4s \n" \ + "fmul v9.4s, v30.4s, v2.4s \n" \ + "fmul v11.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmul v28.4s, v5.4s, v4.4s \n" \ + "fmul v29.4s, v7.4s, v6.4s \n" \ + "fmul v30.4s, v9.4s, v8.4s \n" \ + "fmul v31.4s, v11.4s, v10.4s \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W16_V8_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v24.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v25.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v26.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v27.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v28.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v29.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v30.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif v31.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W16_V8_SEMI2_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v24.4s, v1.4s \n" \ + "fadd v6.4s, v25.4s, v1.4s \n" \ + "fadd v8.4s, v26.4s, v1.4s \n" \ + "fadd v10.4s, v27.4s, v1.4s \n" \ + "fadd v12.4s, v28.4s, v1.4s \n" \ + "fadd v14.4s, v29.4s, v1.4s \n" \ + "fadd v16.4s, v30.4s, v1.4s \n" \ + "fadd v18.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v24.4s, v2.4s \n" \ + "fmul v7.4s, v25.4s, v2.4s \n" \ + "fmul v9.4s, v26.4s, v2.4s \n" \ + "fmul v11.4s, v27.4s, v2.4s \n" \ + "fmul v13.4s, v28.4s, v2.4s \n" \ + "fmul v15.4s, v29.4s, v2.4s \n" \ + "fmul v17.4s, v30.4s, v2.4s \n" \ + "fmul 
v19.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmax v16.4s, v16.4s, v0.4s \n" \ + "fmax v18.4s, v18.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmin v16.4s, v16.4s, v3.4s \n" \ + "fmin v18.4s, v18.4s, v3.4s \n" \ + "fmul v24.4s, v5.4s, v4.4s \n" \ + "fmul v25.4s, v7.4s, v6.4s \n" \ + "fmul v26.4s, v9.4s, v8.4s \n" \ + "fmul v27.4s, v11.4s, v10.4s \n" \ + "fmul v28.4s, v13.4s, v12.4s \n" \ + "fmul v29.4s, v15.4s, v14.4s \n" \ + "fmul v30.4s, v17.4s, v16.4s \n" \ + "fmul v31.4s, v19.4s, v18.4s \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W8_V8_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v10.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v12.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v13.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v14.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v28.16b, v11.16b, v10.16b \n" /* choose*/ \ + "bif v29.16b, v13.16b, v12.16b \n" /* choose*/ \ + "bif v30.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif v31.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W8_V8_SEMI2_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v28.4s, v1.4s \n" \ + "fadd v6.4s, v29.4s, v1.4s \n" \ + "fadd v8.4s, v30.4s, v1.4s \n" \ + "fadd v10.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v28.4s, v2.4s \n" \ + "fmul v7.4s, v29.4s, v2.4s \n" \ + "fmul v9.4s, v30.4s, v2.4s \n" \ + "fmul v11.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmul v28.4s, v5.4s, v4.4s \n" \ + "fmul v29.4s, v7.4s, v6.4s \n" \ + "fmul v30.4s, v9.4s, v8.4s \n" \ + "fmul v31.4s, v11.4s, v10.4s \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W4_V8_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v14.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v15.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v16.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v17.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v30.16b, v15.16b, v14.16b \n" /* choose*/ \ + "bif v31.16b, v17.16b, v16.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W4_V8_SEMI2_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ 
\ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v4.4s, v30.4s, v1.4s \n" \ + "fadd v6.4s, v31.4s, v1.4s \n" \ + "fmul v5.4s, v30.4s, v2.4s \n" \ + "fmul v7.4s, v31.4s, v2.4s \n" \ + "fmax v4.4s, v4.4s, v0.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v4.4s, v4.4s, v3.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v30.4s, v5.4s, v4.4s \n" \ + "fmul v31.4s, v7.4s, v6.4s \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W32_V8_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v21.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v22.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v23.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v24.16b, v9.16b, v8.16b \n" /* choose*/ \ + "fcmge v2.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v25.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v26.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v27.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v28.16b, v9.16b, v8.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W32_V8_SEMI1_HARD_SWISH \ + /* do hard_swish */ \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fadd v8.4s, v22.4s, v1.4s \n" \ + "fadd v10.4s, v23.4s, v1.4s \n" \ + "fadd v12.4s, v24.4s, v1.4s \n" \ + "fadd v14.4s, v25.4s, v1.4s \n" \ + "fadd v16.4s, v26.4s, v1.4s \n" \ + "fadd v18.4s, v27.4s, v1.4s \n" \ + "fadd v30.4s, v28.4s, v1.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmul v9.4s, v22.4s, v2.4s \n" \ + "fmul v11.4s, v23.4s, v2.4s \n" \ + "fmul v13.4s, v24.4s, v2.4s \n" \ + "fmul v15.4s, v25.4s, v2.4s \n" \ + "fmul v17.4s, v26.4s, v2.4s \n" \ + "fmul v19.4s, v27.4s, v2.4s \n" \ + "fmul v31.4s, v28.4s, v2.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmax v14.4s, v14.4s, v0.4s \n" \ + "fmax v16.4s, v16.4s, v0.4s \n" \ + "fmax v18.4s, v18.4s, v0.4s \n" \ + "fmax v30.4s, v30.4s, v0.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmin v14.4s, v14.4s, v3.4s \n" \ + "fmin v16.4s, v16.4s, v3.4s \n" \ + "fmin v18.4s, v18.4s, v3.4s \n" \ + "fmin v30.4s, v30.4s, v3.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fmul v22.4s, v9.4s, v8.4s \n" \ + "fmul v23.4s, v11.4s, v10.4s \n" \ + "fmul v24.4s, v13.4s, v12.4s \n" \ + "fmul v25.4s, v15.4s, v14.4s \n" \ + "fmul v26.4s, v17.4s, v16.4s \n" \ + "fmul 
v27.4s, v19.4s, v18.4s \n" \ + "fmul v28.4s, v31.4s, v30.4s \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W16_V8_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v21.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v22.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v23.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v24.16b, v9.16b, v8.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W16_V8_SEMI1_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fadd v8.4s, v22.4s, v1.4s \n" \ + "fadd v10.4s, v23.4s, v1.4s \n" \ + "fadd v12.4s, v24.4s, v1.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmul v9.4s, v22.4s, v2.4s \n" \ + "fmul v11.4s, v23.4s, v2.4s \n" \ + "fmul v13.4s, v24.4s, v2.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmax v10.4s, v10.4s, v0.4s \n" \ + "fmax v12.4s, v12.4s, v0.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmin v10.4s, v10.4s, v3.4s \n" \ + "fmin v12.4s, v12.4s, v3.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fmul v22.4s, v9.4s, v8.4s \n" \ + "fmul v23.4s, v11.4s, v10.4s \n" \ + "fmul v24.4s, v13.4s, v12.4s \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W8_V8_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "fcmge v4.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v21.16b, v3.16b, v2.16b \n" /* choose*/ \ + "bif v22.16b, v5.16b, v4.16b \n" /* choose*/ \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W8_V8_SEMI1_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fadd v8.4s, v22.4s, v1.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmul v9.4s, v22.4s, v2.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmax v8.4s, v8.4s, v0.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmin v8.4s, v8.4s, v3.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "fmul v22.4s, v9.4s, v8.4s \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W4_V8_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %w[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "movi v0.4s, #0\n" /* for relu6 */ \ + "dup v1.4s, %w[valpha]\n" /* leakey relu alpha */ \ + "fcmge v2.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v3.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ \ + "bif v21.16b, v3.16b, v2.16b \n" /* choose*/ \ + 
"b 9f \n" + +#define SPARSE_INT8_INT8_W4_V8_SEMI1_HARD_SWISH \ + "12: \n" \ + "movi v0.4s, #0 \n" /* for hardswish */ \ + "ldr q1, [%[hs_param], #0] \n" /* offset */ \ + "ldr q2, [%[hs_param], #16] \n" /* scale */ \ + "ldr q3, [%[hs_param], #32] \n" /* threshold */ \ + "fadd v6.4s, v21.4s, v1.4s \n" \ + "fmul v7.4s, v21.4s, v2.4s \n" \ + "fmax v6.4s, v6.4s, v0.4s \n" \ + "fmin v6.4s, v6.4s, v3.4s \n" \ + "fmul v21.4s, v7.4s, v6.4s \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W48_V8_SEMI_OUT \ + SPARSE_INT8_INT8_W48_V8_SEMI_KERNEL \ + SPARSE_INT8_INT8_W48_V8_SEMI_RELU \ + SPARSE_INT8_INT8_W48_V8_SEMI_RELU6 \ + SPARSE_INT8_INT8_W48_V8_SEMI_LEAKY_RELU \ + /* store result */ \ + "ld1 {v0.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v1.4s, v8.4s, v0.4s\n" \ + "fcmge v2.4s, v9.4s, v0.4s\n" \ + "fcmge v3.4s, v10.4s, v0.4s\n" \ + "fcmge v4.4s, v11.4s, v0.4s\n" \ + "fcmge v5.4s, v12.4s, v0.4s\n" \ + "fcmge v6.4s, v13.4s, v0.4s\n" \ + "fcmge v7.4s, v14.4s, v0.4s\n" \ + "bif v8.16b, v0.16b, v1.16b \n" \ + "bif v9.16b, v0.16b, v2.16b \n" \ + "bif v10.16b, v0.16b, v3.16b \n" \ + "bif v11.16b, v0.16b, v4.16b \n" \ + "bif v12.16b, v0.16b, v5.16b \n" \ + "bif v13.16b, v0.16b, v6.16b \n" \ + "bif v14.16b, v0.16b, v7.16b \n" \ + "fcmge v1.4s, v15.4s, v0.4s\n" \ + "fcmge v2.4s, v16.4s, v0.4s\n" \ + "fcmge v3.4s, v17.4s, v0.4s\n" \ + "fcmge v4.4s, v18.4s, v0.4s\n" \ + "fcmge v5.4s, v19.4s, v0.4s\n" \ + "fcmge v6.4s, v20.4s, v0.4s\n" \ + "fcmge v7.4s, v21.4s, v0.4s\n" \ + "bif v15.16b, v0.16b, v1.16b \n" \ + "bif v16.16b, v0.16b, v2.16b \n" \ + "bif v17.16b, v0.16b, v3.16b \n" \ + "bif v18.16b, v0.16b, v4.16b \n" \ + "bif v19.16b, v0.16b, v5.16b \n" \ + "bif v20.16b, v0.16b, v6.16b \n" \ + "bif v21.16b, v0.16b, v7.16b \n" \ + "fcmge v1.4s, v22.4s, v0.4s\n" \ + "fcmge v2.4s, v23.4s, v0.4s\n" \ + "fcmge v3.4s, v24.4s, v0.4s\n" \ + "fcmge v4.4s, v25.4s, v0.4s\n" \ + "fcmge v5.4s, v26.4s, v0.4s\n" \ + "fcmge v6.4s, v27.4s, v0.4s\n" \ + "fcmge v7.4s, v28.4s, v0.4s\n" \ + "bif v22.16b, v0.16b, v1.16b \n" \ + "bif v23.16b, v0.16b, v2.16b \n" \ + "bif v24.16b, v0.16b, v3.16b \n" \ + "bif v25.16b, v0.16b, v4.16b \n" \ + "bif v26.16b, v0.16b, v5.16b \n" \ + "bif v27.16b, v0.16b, v6.16b \n" \ + "bif v28.16b, v0.16b, v7.16b \n" \ + "fcmge v1.4s, v29.4s, v0.4s\n" \ + "fcmge v2.4s, v30.4s, v0.4s\n" \ + "fcmge v3.4s, v31.4s, v0.4s\n" \ + "bif v29.16b, v0.16b, v1.16b \n" \ + "bif v30.16b, v0.16b, v2.16b \n" \ + "bif v31.16b, v0.16b, v3.16b \n" \ + "fcvtas v0.4s, v8.4s\n" /* cvt to int */ \ + "fcvtas v1.4s, v9.4s\n" /* cvt to int */ \ + "fcvtas v2.4s, v10.4s\n" /* cvt to int */ \ + "fcvtas v3.4s, v11.4s\n" /* cvt to int */ \ + "fcvtas v4.4s, v12.4s\n" /* cvt to int */ \ + "fcvtas v5.4s, v13.4s\n" /* cvt to int */ \ + "fcvtas v6.4s, v14.4s\n" /* cvt to int */ \ + "fcvtas v7.4s, v15.4s\n" /* cvt to int */ \ + "sqxtn v8.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v8.8h, v1.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v9.4h, v2.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v9.8h, v3.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v10.4h, v4.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v10.8h, v5.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v11.4h, v6.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v11.8h, v7.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v12.8b, v8.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v12.16b,v9.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v13.8b, v10.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v13.16b,v11.8h\n" /* cvt int16 to int8 */ \ + "fcvtas v0.4s, v16.4s\n" /* cvt to int */ \ + "fcvtas v1.4s, v17.4s\n" /* cvt 
to int */ \ + "fcvtas v2.4s, v18.4s\n" /* cvt to int */ \ + "fcvtas v3.4s, v19.4s\n" /* cvt to int */ \ + "fcvtas v4.4s, v20.4s\n" /* cvt to int */ \ + "fcvtas v5.4s, v21.4s\n" /* cvt to int */ \ + "stp q12, q13, [%[c_ptr1]]\n" \ + "fcvtas v6.4s, v22.4s\n" /* cvt to int */ \ + "fcvtas v7.4s, v23.4s\n" /* cvt to int */ \ + "fcvtas v8.4s, v24.4s\n" /* cvt to int */ \ + "fcvtas v9.4s, v25.4s\n" /* cvt to int */ \ + "fcvtas v10.4s, v26.4s\n" /* cvt to int */ \ + "fcvtas v11.4s, v27.4s\n" /* cvt to int */ \ + "fcvtas v12.4s, v28.4s\n" /* cvt to int */ \ + "fcvtas v13.4s, v29.4s\n" /* cvt to int */ \ + "fcvtas v14.4s, v30.4s\n" /* cvt to int */ \ + "fcvtas v15.4s, v31.4s\n" /* cvt to int */ \ + "sqxtn v16.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v16.8h, v1.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v17.4h, v2.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v17.8h, v3.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v18.4h, v4.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v18.8h, v5.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v19.4h, v6.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v19.8h, v7.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v20.4h, v8.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v20.8h, v9.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v21.4h, v10.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v21.8h, v11.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v22.4h, v12.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v22.8h, v13.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v23.4h, v14.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v23.8h, v15.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v24.8b, v16.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v24.16b, v17.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v25.8b, v18.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v25.16b, v19.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v26.8b, v20.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v26.16b, v21.8h\n" /* cvt int16 to int8 */ /* store result */ \ + "sqxtn v27.8b, v22.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v27.16b, v23.8h\n" /* cvt int16 to int8 */ \ + "str q24, [%[c_ptr1], #32]\n" \ + "stp q25, q26, [%[c_ptr2]]\n" \ + "str q27, [%[c_ptr2], #32]\n" + +#define SPARSE_INT8_INT8_W32_V8_SEMI2_OUT \ + SPARSE_INT8_INT8_W32_V8_SEMI2_KERNEL \ + SPARSE_INT8_INT8_W32_V8_SEMI2_RELU \ + SPARSE_INT8_INT8_W32_V8_SEMI2_RELU6 \ + SPARSE_INT8_INT8_W32_V8_SEMI2_LEAKY_RELU \ + SPARSE_INT8_INT8_W32_V8_SEMI2_HARD_SWISH \ + /* store result */ \ + "ld1 {v15.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v0.4s, v16.4s, v15.4s\n" \ + "fcmge v1.4s, v17.4s, v15.4s\n" \ + "fcmge v2.4s, v18.4s, v15.4s\n" \ + "fcmge v3.4s, v19.4s, v15.4s\n" \ + "fcmge v4.4s, v20.4s, v15.4s\n" \ + "fcmge v5.4s, v21.4s, v15.4s\n" \ + "fcmge v6.4s, v22.4s, v15.4s\n" \ + "fcmge v7.4s, v23.4s, v15.4s\n" \ + "fcmge v8.4s, v24.4s, v15.4s\n" \ + "fcmge v9.4s, v25.4s, v15.4s\n" \ + "fcmge v10.4s, v26.4s, v15.4s\n" \ + "fcmge v11.4s, v27.4s, v15.4s\n" /* choose data */ \ + "fcmge v12.4s, v28.4s, v15.4s\n" \ + "fcmge v13.4s, v29.4s, v15.4s\n" \ + "fcmge v14.4s, v30.4s, v15.4s\n" \ + "bif v16.16b, v15.16b, v0.16b \n" \ + "bif v17.16b, v15.16b, v1.16b \n" \ + "bif v18.16b, v15.16b, v2.16b \n" \ + "bif v19.16b, v15.16b, v3.16b \n" \ + "bif v20.16b, v15.16b, v4.16b \n" \ + "bif v21.16b, v15.16b, v5.16b \n" \ + "bif v22.16b, v15.16b, v6.16b \n" \ + "bif v23.16b, v15.16b, v7.16b \n" \ + "bif v24.16b, v15.16b, v8.16b \n" \ + "bif v25.16b, v15.16b, v9.16b \n" \ + "bif v26.16b, v15.16b, v10.16b \n" \ + "bif v27.16b, v15.16b, v11.16b \n" \ + "bif v28.16b, v15.16b, v12.16b \n" \ + "bif v29.16b, v15.16b, v13.16b \n" 
\ + "bif v30.16b, v15.16b, v14.16b \n" \ + "fcmge v0.4s, v31.4s, v15.4s\n" \ + "bif v31.16b, v15.16b, v0.16b \n" \ + "fcvtas v0.4s, v16.4s\n" /* cvt to int */ \ + "fcvtas v1.4s, v17.4s\n" /* cvt to int */ \ + "fcvtas v2.4s, v18.4s\n" /* cvt to int */ \ + "fcvtas v3.4s, v19.4s\n" /* cvt to int */ \ + "fcvtas v4.4s, v20.4s\n" /* cvt to int */ \ + "fcvtas v5.4s, v21.4s\n" /* cvt to int */ \ + "fcvtas v6.4s, v22.4s\n" /* cvt to int */ \ + "fcvtas v7.4s, v23.4s\n" /* cvt to int */ \ + "fcvtas v8.4s, v24.4s\n" /* cvt to int */ \ + "fcvtas v9.4s, v25.4s\n" /* cvt to int */ \ + "fcvtas v10.4s, v26.4s\n" /* cvt to int */ \ + "fcvtas v11.4s, v27.4s\n" /* cvt to int */ \ + "fcvtas v12.4s, v28.4s\n" /* cvt to int */ \ + "fcvtas v13.4s, v29.4s\n" /* cvt to int */ \ + "fcvtas v14.4s, v30.4s\n" /* cvt to int */ \ + "fcvtas v15.4s, v31.4s\n" /* cvt to int */ \ + "sqxtn v16.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v16.8h, v1.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v17.4h, v2.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v17.8h, v3.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v18.4h, v4.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v18.8h, v5.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v19.4h, v6.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v19.8h, v7.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v20.4h, v8.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v20.8h, v9.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v21.4h, v10.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v21.8h, v11.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v22.4h, v12.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v22.8h, v13.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v23.4h, v14.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v23.8h, v15.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v24.8b, v16.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v24.16b, v17.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v25.8b, v18.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v25.16b, v19.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v26.8b, v20.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v26.16b, v21.8h\n" /* cvt int16 to int8 */ /* store result */ \ + "sqxtn v27.8b, v22.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v27.16b, v23.8h\n" /* cvt int16 to int8 */ \ + "stp q24, q25, [%[c_ptr1]]\n" \ + "stp q26, q27, [%[c_ptr2]]\n" + +#define SPARSE_INT8_INT8_W16_V8_SEMI2_OUT \ + SPARSE_INT8_INT8_W16_V8_SEMI2_KERNEL \ + SPARSE_INT8_INT8_W16_V8_SEMI2_RELU \ + SPARSE_INT8_INT8_W16_V8_SEMI2_RELU6 \ + SPARSE_INT8_INT8_W16_V8_SEMI2_LEAKY_RELU \ + SPARSE_INT8_INT8_W16_V8_SEMI2_HARD_SWISH \ + /* store result */ \ + "ld1 {v16.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v0.4s, v24.4s, v16.4s\n" \ + "fcmge v1.4s, v25.4s, v16.4s\n" \ + "fcmge v2.4s, v26.4s, v16.4s\n" \ + "fcmge v3.4s, v27.4s, v16.4s\n" \ + "fcmge v4.4s, v28.4s, v16.4s\n" \ + "fcmge v5.4s, v29.4s, v16.4s\n" \ + "fcmge v6.4s, v30.4s, v16.4s\n" \ + "fcmge v7.4s, v31.4s, v16.4s\n" \ + "bif v24.16b, v16.16b, v0.16b \n" \ + "bif v25.16b, v16.16b, v1.16b \n" \ + "bif v26.16b, v16.16b, v2.16b \n" \ + "bif v27.16b, v16.16b, v3.16b \n" \ + "bif v28.16b, v16.16b, v4.16b \n" \ + "bif v29.16b, v16.16b, v5.16b \n" \ + "bif v30.16b, v16.16b, v6.16b \n" \ + "bif v31.16b, v16.16b, v7.16b \n" \ + "fcvtas v0.4s, v24.4s\n" /* cvt to int */ \ + "fcvtas v1.4s, v25.4s\n" /* cvt to int */ \ + "fcvtas v2.4s, v26.4s\n" /* cvt to int */ \ + "fcvtas v3.4s, v27.4s\n" /* cvt to int */ \ + "fcvtas v4.4s, v28.4s\n" /* cvt to int */ \ + "fcvtas v5.4s, v29.4s\n" /* cvt to int */ \ + "fcvtas v6.4s, v30.4s\n" /* cvt to int */ \ + "fcvtas v7.4s, v31.4s\n" /* cvt to int */ \ + "sqxtn 
v8.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v8.8h, v1.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v9.4h, v2.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v9.8h, v3.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v10.4h, v4.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v10.8h, v5.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v11.4h, v6.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v11.8h, v7.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v12.8b, v8.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v12.16b, v9.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v13.8b, v10.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v13.16b, v11.8h\n" /* cvt int16 to int8 */ \ + "str q12, [%[c_ptr1]]\n" \ + "str q13, [%[c_ptr2]]\n" + +#define SPARSE_INT8_INT8_W8_V8_SEMI2_OUT \ + SPARSE_INT8_INT8_W8_V8_SEMI2_KERNEL \ + SPARSE_INT8_INT8_W8_V8_SEMI2_RELU \ + SPARSE_INT8_INT8_W8_V8_SEMI2_RELU6 \ + SPARSE_INT8_INT8_W8_V8_SEMI2_LEAKY_RELU \ + SPARSE_INT8_INT8_W8_V8_SEMI2_HARD_SWISH \ + /* store result */ \ + "ld1 {v16.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v4.4s, v28.4s, v16.4s\n" \ + "fcmge v5.4s, v29.4s, v16.4s\n" \ + "fcmge v6.4s, v30.4s, v16.4s\n" \ + "fcmge v7.4s, v31.4s, v16.4s\n" \ + "bif v28.16b, v16.16b, v4.16b \n" \ + "bif v29.16b, v16.16b, v5.16b \n" \ + "bif v30.16b, v16.16b, v6.16b \n" \ + "bif v31.16b, v16.16b, v7.16b \n" \ + "fcvtas v4.4s, v28.4s\n" /* cvt to int */ \ + "fcvtas v5.4s, v29.4s\n" /* cvt to int */ \ + "fcvtas v6.4s, v30.4s\n" /* cvt to int */ \ + "fcvtas v7.4s, v31.4s\n" /* cvt to int */ \ + "sqxtn v10.4h, v4.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v10.8h, v5.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v11.4h, v6.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v11.8h, v7.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v12.8b, v10.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v13.8b, v11.8h\n" /* cvt int16 to int8 */ \ + "str d12, [%[c_ptr1]]\n" \ + "str d13, [%[c_ptr2]]\n" + +#define SPARSE_INT8_INT8_W4_V8_SEMI2_OUT \ + SPARSE_INT8_INT8_W4_V8_SEMI2_KERNEL \ + SPARSE_INT8_INT8_W4_V8_SEMI2_RELU \ + SPARSE_INT8_INT8_W4_V8_SEMI2_RELU6 \ + SPARSE_INT8_INT8_W4_V8_SEMI2_LEAKY_RELU \ + SPARSE_INT8_INT8_W4_V8_SEMI2_HARD_SWISH \ + /* store result */ \ + "ld1 {v16.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v6.4s, v30.4s, v16.4s\n" \ + "fcmge v7.4s, v31.4s, v16.4s\n" \ + "bif v30.16b, v16.16b, v6.16b \n" \ + "bif v31.16b, v16.16b, v7.16b \n" \ + "fcvtas v6.4s, v30.4s\n" /* cvt to int */ \ + "fcvtas v7.4s, v31.4s\n" /* cvt to int */ \ + "sqxtn v11.4h, v6.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v12.4h, v7.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v13.8b, v11.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v14.8b, v12.8h\n" /* cvt int16 to int8 */ \ + "str s13, [%[c_ptr1]]\n" \ + "str s14, [%[c_ptr2]]\n" + +#define SPARSE_INT8_INT8_W32_V8_SEMI1_OUT \ + SPARSE_INT8_INT8_W32_V8_SEMI1_KERNEL \ + SPARSE_INT8_INT8_W32_V8_SEMI1_RELU \ + SPARSE_INT8_INT8_W32_V8_SEMI1_RELU6 \ + SPARSE_INT8_INT8_W32_V8_SEMI1_LEAKY_RELU \ + SPARSE_INT8_INT8_W32_V8_SEMI1_HARD_SWISH \ + /* store result */ \ + "ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v0.4s, v21.4s, v8.4s\n" \ + "fcmge v1.4s, v22.4s, v8.4s\n" \ + "fcmge v2.4s, v23.4s, v8.4s\n" \ + "fcmge v3.4s, v24.4s, v8.4s\n" \ + "fcmge v4.4s, v25.4s, v8.4s\n" \ + "fcmge v5.4s, v26.4s, v8.4s\n" \ + "fcmge v6.4s, v27.4s, v8.4s\n" \ + "fcmge v7.4s, v28.4s, v8.4s\n" /* choose data */ \ + "bif v21.16b, v8.16b, v0.16b \n" \ + "bif v22.16b, v8.16b, v1.16b \n" \ + "bif v23.16b, v8.16b, v2.16b \n" \ + "bif v24.16b, v8.16b, v3.16b \n" \ + "bif v25.16b, v8.16b, 
v4.16b \n" \ + "bif v26.16b, v8.16b, v5.16b \n" \ + "bif v27.16b, v8.16b, v6.16b \n" \ + "bif v28.16b, v8.16b, v7.16b \n" \ + "fcvtas v0.4s, v21.4s\n" /* cvt to int */ \ + "fcvtas v1.4s, v22.4s\n" /* cvt to int */ \ + "fcvtas v2.4s, v23.4s\n" /* cvt to int */ \ + "fcvtas v3.4s, v24.4s\n" /* cvt to int */ \ + "fcvtas v4.4s, v25.4s\n" /* cvt to int */ \ + "fcvtas v5.4s, v26.4s\n" /* cvt to int */ \ + "fcvtas v6.4s, v27.4s\n" /* cvt to int */ \ + "fcvtas v7.4s, v28.4s\n" /* cvt to int */ \ + "sqxtn v16.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v16.8h, v1.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v17.4h, v2.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v17.8h, v3.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v18.4h, v4.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v18.8h, v5.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v19.4h, v6.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v19.8h, v7.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v21.8b, v16.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v21.16b, v17.8h\n" /* cvt int16 to int8 */ \ + "sqxtn v22.8b, v18.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v22.16b, v19.8h\n" /* cvt int16 to int8 */ /* store result */ \ + "stp q21, q22, [%[c_ptr]]\n" + +#define SPARSE_INT8_INT8_W16_V8_SEMI1_OUT \ + SPARSE_INT8_INT8_W16_V8_SEMI1_KERNEL \ + SPARSE_INT8_INT8_W16_V8_SEMI1_RELU \ + SPARSE_INT8_INT8_W16_V8_SEMI1_RELU6 \ + SPARSE_INT8_INT8_W16_V8_SEMI1_LEAKY_RELU \ + SPARSE_INT8_INT8_W16_V8_SEMI1_HARD_SWISH \ + "ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v0.4s, v21.4s, v8.4s\n" \ + "fcmge v1.4s, v22.4s, v8.4s\n" \ + "fcmge v2.4s, v23.4s, v8.4s\n" \ + "fcmge v3.4s, v24.4s, v8.4s\n" /* choose data */ \ + "bif v21.16b, v8.16b, v0.16b \n" \ + "bif v22.16b, v8.16b, v1.16b \n" \ + "bif v23.16b, v8.16b, v2.16b \n" \ + "bif v24.16b, v8.16b, v3.16b \n" \ + "fcvtas v0.4s, v21.4s\n" /* cvt to int */ \ + "fcvtas v1.4s, v22.4s\n" /* cvt to int */ \ + "fcvtas v2.4s, v23.4s\n" /* cvt to int */ \ + "fcvtas v3.4s, v24.4s\n" /* cvt to int */ \ + "sqxtn v16.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v16.8h, v1.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v17.4h, v2.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v17.8h, v3.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v21.8b, v16.8h\n" /* cvt int16 to int8 */ \ + "sqxtn2 v21.16b, v17.8h\n" /* cvt int16 to int8 */ /* store result */ \ + "str q21, [%[c_ptr]]\n" + +#define SPARSE_INT8_INT8_W8_V8_SEMI1_OUT \ + SPARSE_INT8_INT8_W8_V8_SEMI1_KERNEL \ + SPARSE_INT8_INT8_W8_V8_SEMI1_RELU \ + SPARSE_INT8_INT8_W8_V8_SEMI1_RELU6 \ + SPARSE_INT8_INT8_W8_V8_SEMI1_LEAKY_RELU \ + SPARSE_INT8_INT8_W8_V8_SEMI1_HARD_SWISH \ + "ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v0.4s, v21.4s, v8.4s\n" \ + "fcmge v1.4s, v22.4s, v8.4s\n" /* choose data */ \ + "bif v21.16b, v8.16b, v0.16b \n" \ + "bif v22.16b, v8.16b, v1.16b \n" \ + "fcvtas v0.4s, v21.4s\n" /* cvt to int */ \ + "fcvtas v1.4s, v22.4s\n" /* cvt to int */ \ + "sqxtn v16.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn2 v16.8h, v1.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v21.8b, v16.8h\n" /* cvt int16 to int8 */ /* store result */ \ + "str d21, [%[c_ptr]]\n" + +#define SPARSE_INT8_INT8_W4_V8_SEMI1_OUT \ + SPARSE_INT8_INT8_W4_V8_SEMI1_KERNEL \ + SPARSE_INT8_INT8_W4_V8_SEMI1_RELU \ + SPARSE_INT8_INT8_W4_V8_SEMI1_RELU6 \ + SPARSE_INT8_INT8_W4_V8_SEMI1_LEAKY_RELU \ + SPARSE_INT8_INT8_W4_V8_SEMI1_HARD_SWISH \ + "ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ /* data >= -127 */ \ + "fcmge v0.4s, v21.4s, v8.4s \n" /* choose data */ \ + "bif v21.16b, v8.16b, v0.16b \n" \ + 
"fcvtas v0.4s, v21.4s\n" /* cvt to int */ \ + "sqxtn v16.4h, v0.4s\n" /* cvt int32 to int16 */ \ + "sqxtn v21.8b, v16.8h\n" /* cvt int16 to int8 */ /* store result */ \ + "str s21, [%[c_ptr]]\n" + +#define SET_ASSM_INPUT_PARAM1_V8_INT8_INT8 \ +: [a_ptr] "+r"(cur_w), \ + [b_ptr] "+r"(cur_b), \ + [c_ptr1] "+r"(out_ptr1), \ + [c_ptr2] "+r"(out_ptr2), \ + [k] "+r"(nnz), \ + [widx_dmap] "+r"(dmap), \ + [bias_ptr] "+r"(pbias), \ + [scale_ptr] "+r"(pscale) \ +: [vflag_act] "r"(flag_act), \ + [valpha] "r"(alpha), \ + [vmax] "r"(vmax), \ + [hs_param] "r"(hs_param) + +#define SET_ASSM_INPUT_PARAM2_V8_INT8_INT8 \ +: [a_ptr] "+r"(cur_w), \ + [b_ptr] "+r"(cur_b), \ + [c_ptr] "+r"(out_ptr), \ + [k] "+r"(nnz), \ + [widx_dmap] "+r"(dmap) \ +: [vscale] "r"(vscale), \ + [vbias] "r"(vbias), \ + [vflag_act] "r"(flag_act), \ + [valpha] "r"(alpha), \ + [vmax] "r"(vmax), \ + [hs_param] "r"(hs_param) + +/** + * \brief Semi-structured Sparse calculation implementation of 1x1 convolution, + * both input and output are int8. + * Semi-structured Sparse matrix multiplication is calculated in blocks, the + * block size is Mx32, + * that is, + * the parameter matrix is MxK, and the activation matrix is Kx48; when N is + * less than 32, + * it is calculated in blocks of Mx16, Mx8, and Mx4 in turn; + * @param A Semi-structured Sparse weight data + * @param B dense input data + * @param widx_dmap An array of int32_t values storing scaled [by sizeof(input + * element)] difference + * between input channels corresponding to successive non-zero element + * @param nidx_nnzmap the number of non-zero kernel elements per each output + * channel + * @param bias + * @param output + * @param M + * @param N + * @param K + * @param param + * @param ctx + */ +void sparse_semi_conv_int8_int8_pipelined( + const int8_t* A, + const int8_t* B, + const int32_t* widx_dmap, + const uint32_t* nidx_nnzmap, + const float* bias, + const float* scale, + int8_t* output, + int M, + int K, + int N, + const operators::SparseConvParam& param, + ARMContext* ctx) { + auto act_param = param.activation_param; + auto act_type = act_param.active_type; + volatile float alpha = 0.f; + float hs_param[12] = {0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + alpha = act_param.Relu_clipped_coef; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + alpha = act_param.Leaky_relu_alpha; + } else if (act_type == lite_api::ActivationType::kHardSwish) { + flag_act = 0x04; + for (int i = 0; i < 4; i++) { + hs_param[i] = act_param.hard_swish_offset / param.output_scale; + hs_param[i + 4] = 1.0 / act_param.hard_swish_scale; + hs_param[i + 8] = act_param.hard_swish_threshold / param.output_scale; + } + } + } + int flag_bias = (bias != nullptr) ? 
+  size_t mc = N * sizeof(int8_t);
+  size_t nc = M;
+  size_t output_stride = N * sizeof(int8_t);
+  size_t output_decrement = output_stride * nc - 32 * sizeof(int8_t);
+  size_t pair_num = M / 2;
+  size_t lave_num = M % 2;
+  float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
+  float bias_zero[2] = {0.f, 0.f};
+  while
+    SPARSE_LIKELY(mc >= 32 * sizeof(int8_t)) {
+      LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+        GET_SEMI_PARAM_TABLE(int8_t, int8_t)
+        const float* pscale = scale + i * 2;
+        // clang-format off
+        asm volatile(SPARSE_INT8_INT8_W32_V8_SEMI2_OUT
+          SET_ASSM_INPUT_PARAM1_V8_INT8_INT8
+          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+          "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+          "v26", "v27", "v28", "v29", "v30", "v31", "w1", "x1", "cc", "memory");
+        // clang-format on
+      }
+      LITE_PARALLEL_COMMON_END();
+      if
+        SPARSE_UNLIKELY(lave_num != 0) {
+          GET_UNSTRUCT_PARAM_TABLE(int8_t, int8_t)
+          float vscale = scale[pair_num * 2];
+          // clang-format off
+          asm volatile(SPARSE_INT8_INT8_W32_V8_SEMI1_OUT
+            SET_ASSM_INPUT_PARAM2_V8_INT8_INT8
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+            "v16", "v17", "v18", "v19", "v21", "v22", "v23", "v24", "v25",
+            "v26", "v27", "v28", "v30", "v31", "w0", "w1", "x1", "cc", "memory");
+          // clang-format on
+        }
+      output =
+          reinterpret_cast<int8_t*>((uintptr_t)output + 32 * sizeof(int8_t));
+      B += 32;
+      mc -= 32 * sizeof(int8_t);
+    }
+  if
+    SPARSE_UNLIKELY(mc != 0) {
+      output_decrement += 16 * sizeof(int8_t);
+      if (mc & (16 * sizeof(int8_t))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(int8_t, int8_t)
+          const float* pscale = scale + i * 2;
+          // clang-format off
+          asm volatile(SPARSE_INT8_INT8_W16_V8_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_V8_INT8_INT8
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+            "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "w1", "x1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(int8_t, int8_t)
+            float vscale = scale[pair_num * 2];
+            // clang-format off
+            asm volatile(SPARSE_INT8_INT8_W16_V8_SEMI1_OUT
+              SET_ASSM_INPUT_PARAM2_V8_INT8_INT8
+              : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+              "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v21", "v22", "v23",
+              "v24", "w1", "x1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<int8_t*>((uintptr_t)output + 16 * sizeof(int8_t));
+        B += 16;
+        mc -= 16 * sizeof(int8_t);
+      }
+      output_decrement += 8 * sizeof(int8_t);
+      if (mc & (8 * sizeof(int8_t))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(int8_t, int8_t)
+          const float* pscale = scale + i * 2;
+          // clang-format off
+          asm volatile(SPARSE_INT8_INT8_W8_V8_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_V8_INT8_INT8
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+            "v16", "v17", "v28", "v29", "v30", "v31", "w1", "x1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(int8_t, int8_t)
+            float vscale = scale[pair_num * 2];
+            // clang-format off
+            asm volatile(SPARSE_INT8_INT8_W8_V8_SEMI1_OUT
+              SET_ASSM_INPUT_PARAM2_V8_INT8_INT8
+              : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"v12", "v16", "v21", + "v22", "w1", "x1", "cc", "memory"); + // clang-format on + } + output = + reinterpret_cast((uintptr_t)output + 8 * sizeof(int8_t)); + B += 8; + mc -= 8 * sizeof(int8_t); + } + output_decrement += 4 * sizeof(int8_t); + if (mc & (4 * sizeof(int8_t))) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_PARAM_TABLE(int8_t, int8_t) + const float* pscale = scale + i * 2; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W4_V8_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_V8_INT8_INT8 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31", "w1", "w2", "w3", "w4", "w5", "x1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_PARAM_TABLE(int8_t, int8_t) + float vscale = scale[pair_num * 2]; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W4_V8_SEMI1_OUT + SET_ASSM_INPUT_PARAM2_V8_INT8_INT8 + : "v0", "v1", "v2", "v3", "v4", "v6", "v7", "v8", "v9", "v11", "v16", "v21", + "w1", "w2", "w3", "w4", "w5", "x1", "cc", "memory"); + // clang-format on + } + output = + reinterpret_cast((uintptr_t)output + 4 * sizeof(int8_t)); + B += 4; + mc -= 4 * sizeof(int8_t); + } + + if + SPARSE_UNLIKELY(mc != 0 && mc < 4 * sizeof(int8_t)) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + int8_t* out_ptr1 = reinterpret_cast((uintptr_t)output + + output_stride * 2 * i); + int8_t* out_ptr2 = reinterpret_cast( + (uintptr_t)output + output_stride * (2 * i + 1)); + const int8_t* cur_w = A; + uint32_t nnz = nidx_nnzmap[i]; + const int8_t* cur_b = B; + const int32_t* dmap = widx_dmap; + if (i != 0) { + cur_w = A + nidx_nnzmap[i - 1] * 2; + nnz = nidx_nnzmap[i] - nidx_nnzmap[i - 1]; + cur_b += + ((nidx_nnzmap[i - 1] == 0) + ? 
+              dmap = widx_dmap + nidx_nnzmap[i - 1];
+            }
+
+            float32x4_t vbias1, vbias2;
+            if (bias != nullptr) {
+              vbias1 = vdupq_n_f32(bias[i * 2]);
+              vbias2 = vdupq_n_f32(bias[i * 2 + 1]);
+            } else {
+              vbias1 = vdupq_n_f32(0);
+              vbias2 = vdupq_n_f32(0);
+            }
+            const float* pscale = scale + i * 2;
+            int32x4_t vacc0123_1 = vdupq_n_s32(0);
+            int32x4_t vacc0123_2 = vdupq_n_s32(0);
+            for (int j = 0; j < nnz; j++) {
+              int8x8_t vi0123 = vdup_n_s8(0);
+              if (mc == 1) {
+                vi0123 = vld1_lane_s8(cur_b, vi0123, 0);
+              } else if (mc == 2) {
+                vi0123 = vld1_lane_s8(cur_b, vi0123, 0);
+                vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1);
+              } else {
+                vi0123 = vld1_lane_s8(cur_b, vi0123, 0);
+                vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1);
+                vi0123 = vld1_lane_s8(cur_b + 2, vi0123, 2);
+              }
+              int8x8_t vw1 = vld1_dup_s8(cur_w);
+              cur_w += 1;
+              int8x8_t vw2 = vld1_dup_s8(cur_w);
+              cur_w += 1;
+              intptr_t diff = *dmap++;
+              cur_b = (const int8_t*)((uintptr_t)cur_b + (uintptr_t)diff);
+              vacc0123_1 =
+                  vaddw_s16(vacc0123_1, vget_low_s16(vmull_s8(vi0123, vw1)));
+              vacc0123_2 =
+                  vaddw_s16(vacc0123_2, vget_low_s16(vmull_s8(vi0123, vw2)));
+            }
+            float32x4_t vaccf0123_1 = vcvtq_f32_s32(vacc0123_1);
+            float32x4_t vaccf0123_2 = vcvtq_f32_s32(vacc0123_2);
+            vaccf0123_1 = vmlaq_n_f32(vbias1, vaccf0123_1, pscale[0]);
+            vaccf0123_2 = vmlaq_n_f32(vbias2, vaccf0123_2, pscale[1]);
+            float32x4_t vzero = vdupq_n_f32(0);
+            if (flag_act == 1) {
+              vaccf0123_1 = vmaxq_f32(vaccf0123_1, vzero);
+              vaccf0123_2 = vmaxq_f32(vaccf0123_2, vzero);
+            } else if (flag_act == 0) {
+            } else if (flag_act == 2) {
+              float32x4_t aph = vdupq_n_f32(alpha);
+              vaccf0123_1 = vmaxq_f32(vaccf0123_1, vzero);
+              vaccf0123_1 = vminq_f32(vaccf0123_1, aph);
+              vaccf0123_2 = vmaxq_f32(vaccf0123_2, vzero);
+              vaccf0123_2 = vminq_f32(vaccf0123_2, aph);
+            } else if (flag_act == 3) {
+              float32x4_t aph = vdupq_n_f32(alpha);
+              uint32x4_t vflag0123_1 = vcgeq_f32(vaccf0123_1, vzero);
+              float32x4_t v0123_1 = vmulq_f32(vaccf0123_1, aph);
+              vaccf0123_1 = vbslq_f32(vflag0123_1, vaccf0123_1, v0123_1);
+              uint32x4_t vflag0123_2 = vcgeq_f32(vaccf0123_2, vzero);
+              float32x4_t v0123_2 = vmulq_f32(vaccf0123_2, aph);
+              vaccf0123_2 = vbslq_f32(vflag0123_2, vaccf0123_2, v0123_2);
+            } else {
+              float32x4_t offset =
+                  vdupq_n_f32(act_param.hard_swish_offset / param.output_scale);
+              float32x4_t hs_scale =
+                  vdupq_n_f32(1.0 / act_param.hard_swish_scale);
+              float32x4_t thre = vdupq_n_f32(act_param.hard_swish_threshold /
+                                             param.output_scale);
+              float32x4_t vset0123_1 = vaddq_f32(vaccf0123_1, offset);
+              float32x4_t vscale0123_1 = vmulq_f32(vaccf0123_1, hs_scale);
+              vset0123_1 = vmaxq_f32(vset0123_1, vzero);
+              vset0123_1 = vminq_f32(vset0123_1, thre);
+              vaccf0123_1 = vmulq_f32(vscale0123_1, vset0123_1);
+              float32x4_t vset0123_2 = vaddq_f32(vaccf0123_2, offset);
+              float32x4_t vscale0123_2 = vmulq_f32(vaccf0123_2, hs_scale);
+              vset0123_2 = vmaxq_f32(vset0123_2, vzero);
+              vset0123_2 = vminq_f32(vset0123_2, thre);
+              vaccf0123_2 = vmulq_f32(vscale0123_2, vset0123_2);
+            }
+            vaccf0123_1 = vbslq_f32(vcgeq_f32(vaccf0123_1, vdupq_n_f32(-127.0)),
+                                    vaccf0123_1,
+                                    vdupq_n_f32(-127.0));
+            vaccf0123_2 = vbslq_f32(vcgeq_f32(vaccf0123_2, vdupq_n_f32(-127.0)),
+                                    vaccf0123_2,
+                                    vdupq_n_f32(-127.0));
+            float32x4_t vpos = vdupq_n_f32(0.5);
+            float32x4_t vneg = vdupq_n_f32(-0.5);
+            vaccf0123_1 = vbslq_f32(vcgeq_f32(vaccf0123_1, vzero),
+                                    vaddq_f32(vaccf0123_1, vpos),
+                                    vaddq_f32(vaccf0123_1, vneg));
+            vaccf0123_2 = vbslq_f32(vcgeq_f32(vaccf0123_2, vzero),
+                                    vaddq_f32(vaccf0123_2, vpos),
+                                    vaddq_f32(vaccf0123_2, vneg));
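+            // The +/-0.5 adjustment above, followed by the truncating
+            // vcvtq_s32_f32 below, rounds half away from zero, matching the
+            // fcvtas (round-to-nearest, ties away) used by the assembly
+            // paths; combined with the clamp at -127.0 this yields
+            // out = clamp(round(x), -127, 127) after the saturating narrows.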
+        float32x4_t vpos = vdupq_n_f32(0.5);
+        float32x4_t vneg = vdupq_n_f32(-0.5);
+        vaccf0123_1 = vbslq_f32(vcgeq_f32(vaccf0123_1, vzero),
+                                vaddq_f32(vaccf0123_1, vpos),
+                                vaddq_f32(vaccf0123_1, vneg));
+        vaccf0123_2 = vbslq_f32(vcgeq_f32(vaccf0123_2, vzero),
+                                vaddq_f32(vaccf0123_2, vpos),
+                                vaddq_f32(vaccf0123_2, vneg));
+
+        int32x4_t vacci0123_1 = vcvtq_s32_f32(vaccf0123_1);
+        int16x4_t v16i0123_1 = vqmovn_s32(vacci0123_1);
+        int16x4_t v16i4567_1 = vdup_n_s16(0);
+        int8x8_t v8i01234567_1 =
+            vqmovn_s16(vcombine_s16(v16i0123_1, v16i4567_1));
+
+        int32x4_t vacci0123_2 = vcvtq_s32_f32(vaccf0123_2);
+        int16x4_t v16i0123_2 = vqmovn_s32(vacci0123_2);
+        int16x4_t v16i4567_2 = vdup_n_s16(0);
+        int8x8_t v8i01234567_2 =
+            vqmovn_s16(vcombine_s16(v16i0123_2, v16i4567_2));
+
+        if (mc == 1) {
+          vst1_lane_s8(out_ptr1, v8i01234567_1, 0);
+          vst1_lane_s8(out_ptr2, v8i01234567_2, 0);
+        } else if (mc == 2) {
+          vst1_lane_s8(out_ptr1, v8i01234567_1, 0);
+          vst1_lane_s8(out_ptr2, v8i01234567_2, 0);
+          vst1_lane_s8(out_ptr1 + 1, v8i01234567_1, 1);
+          vst1_lane_s8(out_ptr2 + 1, v8i01234567_2, 1);
+        } else {
+          vst1_lane_s8(out_ptr1, v8i01234567_1, 0);
+          vst1_lane_s8(out_ptr2, v8i01234567_2, 0);
+          vst1_lane_s8(out_ptr1 + 1, v8i01234567_1, 1);
+          vst1_lane_s8(out_ptr2 + 1, v8i01234567_2, 1);
+          vst1_lane_s8(out_ptr1 + 2, v8i01234567_1, 2);
+          vst1_lane_s8(out_ptr2 + 2, v8i01234567_2, 2);
+        }
+      }
+      LITE_PARALLEL_COMMON_END();
+      if
+        SPARSE_UNLIKELY(lave_num != 0) {
+          int8_t* out_ptr = reinterpret_cast<int8_t*>(
+              (uintptr_t)output + output_stride * 2 * pair_num);
+          const int8_t* cur_w = A;
+          uint32_t nnz = nidx_nnzmap[pair_num];
+          const int8_t* cur_b = B;
+          const int32_t* dmap = widx_dmap;
+          if (pair_num != 0) {
+            cur_w = A + nidx_nnzmap[pair_num - 1] * 2;
+            nnz = nidx_nnzmap[pair_num] - nidx_nnzmap[pair_num - 1];
+            cur_b += ((nidx_nnzmap[pair_num - 1] == 0)
+                          ? 0
+                          : widx_dmap[nidx_nnzmap[pair_num - 1] - 1] /
+                                sizeof(int8_t));
+            dmap = widx_dmap + nidx_nnzmap[pair_num - 1];
+          }
+
+          float32x4_t vbias = (bias != nullptr)
+                                  ? vdupq_n_f32(bias[pair_num * 2])
+                                  : vdupq_n_f32(0);
+          float vscale = scale[pair_num * 2];
+          int32x4_t vacc0123 = vdupq_n_s32(0);
+
+          for (int j = 0; j < nnz; j++) {
+            int8x8_t vi0123 = vdup_n_s8(0);
+            if (mc == 1) {
+              vi0123 = vld1_lane_s8(cur_b, vi0123, 0);
+            } else if (mc == 2) {
+              vi0123 = vld1_lane_s8(cur_b, vi0123, 0);
+              vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1);
+            } else {
+              vi0123 = vld1_lane_s8(cur_b, vi0123, 0);
+              vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1);
+              vi0123 = vld1_lane_s8(cur_b + 2, vi0123, 2);
+            }
+            int8x8_t vw = vld1_dup_s8(cur_w);
+            cur_w += 1;
+            intptr_t diff = *dmap++;
+            cur_b = (const int8_t*)((uintptr_t)cur_b + (uintptr_t)diff);
+            int16x8_t vo0123 = vmull_s8(vi0123, vw);
+            vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vo0123));
+          }
+          float32x4_t vaccf0123 = vcvtq_f32_s32(vacc0123);
+          vaccf0123 = vmlaq_n_f32(vbias, vaccf0123, vscale);
+          float32x4_t vzero = vdupq_n_f32(0);
+          if (flag_act == 1) {
+            vaccf0123 = vmaxq_f32(vaccf0123, vzero);
+          } else if (flag_act == 0) {
+          } else if (flag_act == 2) {
+            float32x4_t aph = vdupq_n_f32(alpha);
+            vaccf0123 = vmaxq_f32(vaccf0123, vzero);
+            vaccf0123 = vminq_f32(vaccf0123, aph);
+          } else if (flag_act == 3) {
+            float32x4_t aph = vdupq_n_f32(alpha);
+            uint32x4_t vflag0123 = vcgeq_f32(vaccf0123, vzero);
+            float32x4_t v0123 = vmulq_f32(vaccf0123, aph);
+            vaccf0123 = vbslq_f32(vflag0123, vaccf0123, v0123);
+          } else {
+            float32x4_t offset = vdupq_n_f32(act_param.hard_swish_offset /
+                                             param.output_scale);
+            float32x4_t hs_scale =
+                vdupq_n_f32(1.0 / act_param.hard_swish_scale);
+            float32x4_t thre = vdupq_n_f32(act_param.hard_swish_threshold /
+                                           param.output_scale);
+            float32x4_t vset0123 = vaddq_f32(vaccf0123, offset);
+            float32x4_t vscale0123 = vmulq_f32(vaccf0123, hs_scale);
+            vset0123 = vmaxq_f32(vset0123,
vzero); + vset0123 = vminq_f32(vset0123, thre); + vaccf0123 = vmulq_f32(vscale0123, vset0123); + } + vaccf0123 = vbslq_f32(vcgeq_f32(vaccf0123, vdupq_n_f32(-127.0)), + vaccf0123, + vdupq_n_f32(-127.0)); + float32x4_t vpos = vdupq_n_f32(0.5); + float32x4_t vneg = vdupq_n_f32(-0.5); + vaccf0123 = vbslq_f32(vcgeq_f32(vaccf0123, vzero), + vaddq_f32(vaccf0123, vpos), + vaddq_f32(vaccf0123, vneg)); + + int32x4_t vacci0123 = vcvtq_s32_f32(vaccf0123); + + int16x4_t v16i0123 = vqmovn_s32(vacci0123); + int16x4_t v16i4567 = vdup_n_s16(0); + int8x8_t v8i01234567 = + vqmovn_s16(vcombine_s16(v16i0123, v16i4567)); + + if (mc == 1) { + vst1_lane_s8(out_ptr, v8i01234567, 0); + } else if (mc == 2) { + vst1_lane_s8(out_ptr, v8i01234567, 0); + vst1_lane_s8(out_ptr + 1, v8i01234567, 1); + } else { + vst1_lane_s8(out_ptr, v8i01234567, 0); + vst1_lane_s8(out_ptr + 1, v8i01234567, 1); + vst1_lane_s8(out_ptr + 2, v8i01234567, 2); + } + } + } + } +} + +#undef INIT_SEMI_CONV_PARAM +#undef GET_SEMI_PARAM_TABLE +#undef GET_UNSTRUCT_PARAM_TABLE +#undef SET_ASSM_INPUT_PARAM1_V8_F32 +#undef SET_ASSM_INPUT_PARAM2_V8_F32 +#undef COMPUTE_ACT_NEON_TWO_V8_F32 +#undef COMPUTE_ACT_NEON_ONE_V8_F32 +#undef SET_ASSM_INPUT_PARAM1_V8_INT8_F32 +#undef SET_ASSM_INPUT_PARAM2_V8_INT8_F32 +#undef SET_ASSM_INPUT_PARAM1_V8_INT8_INT8 +#undef SET_ASSM_INPUT_PARAM2_V8_INT8_INT8 + +#else // armv7 + +#define SPARSE_F32_F32_W24_v7_SEMI2_KERNEL \ + "vld1.32 {d0}, [%[bias_ptr]]\n" \ + "vdup.32 q4, d0[0]\n" \ + "vdup.32 q5, d0[0]\n" \ + "vdup.32 q6, d0[0]\n" \ + "pld [%[a_ptr], #128] \n" \ + "vdup.32 q7, d0[0]\n" \ + "vdup.32 q8, d0[0]\n" \ + "vdup.32 q9, d0[0]\n" \ + "pld [%[widx_dmap], #128] \n" \ + "vdup.32 q10, d0[1]\n" \ + "vdup.32 q11, d0[1]\n" \ + "vdup.32 q12, d0[1]\n" \ + "pld [%[b_ptr], #128] \n" \ + "vdup.32 q13, d0[1]\n" \ + "vdup.32 q14, d0[1]\n" \ + "vdup.32 q15, d0[1]\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "vld1.32 {d0}, [%[a_ptr]]!\n" \ + "ldr r0, [%[widx_dmap]], #4\n" \ + "vld1.32 {d2-d5}, [%[b_ptr]]\n" \ + "vmla.f32 q4, q1, d0[0]\n" \ + "vmla.f32 q5, q2, d0[0]\n" \ + "vmla.f32 q10, q1, d0[1]\n" \ + "vmla.f32 q11, q2, d0[1]\n" \ + "subs %[k], %[k], #1\n" \ + "mov r1, %[b_ptr]\n" \ + "add r1, r1, #32\n" \ + "vld1.32 {d4-d7}, [r1]\n" \ + "vmla.f32 q6, q2, d0[0]\n" \ + "vmla.f32 q12, q2, d0[1]\n" \ + "add r1, r1, #32\n" \ + "add %[b_ptr], %[b_ptr], r0\n" \ + "vmla.f32 q7, q3, d0[0]\n" \ + "vmla.f32 q13, q3, d0[1]\n" \ + "vld1.32 {d2-d5}, [r1]\n" \ + "vmla.f32 q8, q1, d0[0]\n" \ + "vmla.f32 q14, q1, d0[1]\n" \ + "vmla.f32 q9, q2, d0[0]\n" \ + "vmla.f32 q15, q2, d0[1]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W16_v7_SEMI2_KERNEL \ + "vld1.32 {d0}, [%[bias_ptr]]\n" \ + "pld [%[a_ptr], #128] \n" \ + "vdup.32 q8, d0[0]\n" \ + "vdup.32 q9, d0[0]\n" \ + "pld [%[widx_dmap], #128] \n" \ + "vdup.32 q10, d0[0]\n" \ + "vdup.32 q11, d0[0]\n" \ + "vdup.32 q12, d0[1]\n" \ + "pld [%[b_ptr], #128] \n" \ + "vdup.32 q13, d0[1]\n" \ + "vdup.32 q14, d0[1]\n" \ + "vdup.32 q15, d0[1]\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "vld1.32 {d0}, [%[a_ptr]]!\n" \ + "ldr r0, [%[widx_dmap]], #4\n" \ + "vld1.32 {d2-d5}, [%[b_ptr]]\n" \ + "vmla.f32 q8, q1, d0[0]\n" \ + "vmla.f32 q9, q2, d0[0]\n" \ + "vmla.f32 q12, q1, d0[1]\n" \ + "vmla.f32 q13, q2, d0[1]\n" \ + "subs %[k], %[k], #1\n" \ + "mov r1, %[b_ptr]\n" \ + "add %[b_ptr], %[b_ptr], r0\n" \ + "add r1, r1, #32\n" \ + "vld1.32 {d6-d9}, [r1]\n" \ + "vmla.f32 q10, q3, d0[0]\n" \ + "vmla.f32 q11, q4, d0[0]\n" \ + "vmla.f32 q14, q3, d0[1]\n" \ + 
"vmla.f32 q15, q4, d0[1]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W8_v7_SEMI2_KERNEL \ + "vld1.32 {d0}, [%[bias_ptr]]\n" \ + "vdup.32 q8, d0[0]\n" \ + "vdup.32 q9, d0[0]\n" \ + "vdup.32 q10, d0[1]\n" \ + "vdup.32 q11, d0[1]\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "vld1.32 {d0}, [%[a_ptr]]!\n" \ + "ldr r0, [%[widx_dmap]], #4\n" \ + "subs %[k], %[k], #1\n" \ + "vld1.32 {d2-d5}, [%[b_ptr]]\n" \ + "vmla.f32 q8, q1, d0[0]\n" \ + "vmla.f32 q9, q2, d0[0]\n" \ + "add %[b_ptr], %[b_ptr], r0\n" \ + "vmla.f32 q10, q1, d0[1]\n" \ + "vmla.f32 q11, q2, d0[1]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W4_v7_SEMI2_KERNEL \ + "vld1.32 {d0}, [%[bias_ptr]]\n" \ + "vdup.32 q8, d0[0]\n" \ + "vdup.32 q9, d0[1]\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "vld1.32 {d0}, [%[a_ptr]]!\n" \ + "ldr r0, [%[widx_dmap]], #4\n" \ + "subs %[k], %[k], #1\n" \ + "vld1.32 {d2-d3}, [%[b_ptr]]\n" \ + "vmla.f32 q8, q1, d0[0]\n" \ + "add %[b_ptr], %[b_ptr], r0\n" \ + "vmla.f32 q9, q1, d0[1]\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W24_v7_SEMI1_KERNEL \ + "vdup.32 q10, %[vbias]\n" \ + "pld [%[a_ptr], #128] \n" \ + "vdup.32 q11, d20[0]\n" \ + "pld [%[widx_dmap], #128] \n" \ + "vdup.32 q12, d20[0]\n" \ + "vdup.32 q13, d20[0]\n" \ + "pld [%[b_ptr], #128] \n" \ + "vdup.32 q14, d20[0]\n" \ + "vdup.32 q15, d20[0]\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldr r0, [%[a_ptr]], #4\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "vdup.32 q0, r0\n" \ + "vld1.32 {d2-d5}, [%[b_ptr]]\n" \ + "vmla.f32 q10, q1, q0\n" \ + "vmla.f32 q11, q2, q0\n" \ + "mov r2, %[b_ptr]\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "add r2, r2, #32\n" \ + "vld1.32 {d6-d9}, [r2]\n" \ + "vmla.f32 q12, q3, q0\n" \ + "vmla.f32 q13, q4, q0\n" \ + "subs %[k], %[k], #1\n" \ + "add r2, r2, #32\n" \ + "vld1.32 {d10-d13}, [r2]\n" \ + "vmla.f32 q14, q5, q0\n" \ + "vmla.f32 q15, q6, q0\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W16_v7_SEMI1_KERNEL \ + "vdup.32 q10, %[vbias]\n" \ + "vdup.32 q11, d20[0]\n" \ + "vdup.32 q12, d20[0]\n" \ + "vdup.32 q13, d20[0]\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldr r0, [%[a_ptr]], #4\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "vdup.32 q0, r0\n" \ + "vld1.32 {d2-d5}, [%[b_ptr]]\n" \ + "vmla.f32 q10, q1, q0\n" \ + "vmla.f32 q11, q2, q0\n" \ + "mov r2, %[b_ptr]\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "add r2, r2, #32\n" \ + "vld1.32 {d6-d9}, [r2]\n" \ + "subs %[k], %[k], #1\n" \ + "vmla.f32 q12, q3, q0\n" \ + "vmla.f32 q13, q4, q0\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W8_v7_SEMI1_KERNEL \ + "vdup.32 q10, %[vbias]\n" \ + "vdup.32 q11, d20[0]\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldr r0, [%[a_ptr]], #4\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "subs %[k], %[k], #1\n" \ + "vdup.32 q0, r0\n" \ + "vld1.32 {d2-d5}, [%[b_ptr]]\n" \ + "vmla.f32 q10, q1, q0\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "vmla.f32 q11, q2, q0\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W4_v7_SEMI1_KERNEL \ + "vdup.32 q10, %[vbias]\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldr r0, [%[a_ptr]], #4\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "subs %[k], %[k], #1\n" \ + "vdup.32 q0, r0\n" \ + "vld1.32 {d2-d3}, [%[b_ptr]]\n" \ + "vmla.f32 q10, q1, q0\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_F32_F32_W24_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end 
*/ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q4, q4, q0\n" /* relu */ \ + "vmax.f32 q5, q5, q0\n" /* relu */ \ + "vmax.f32 q6, q6, q0\n" /* relu */ \ + "vmax.f32 q7, q7, q0\n" /* relu */ \ + "vmax.f32 q8, q8, q0\n" /* relu */ \ + "vmax.f32 q9, q9, q0\n" /* relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "vmax.f32 q12, q12, q0\n" /* relu */ \ + "vmax.f32 q13, q13, q0\n" /* relu */ \ + "vmax.f32 q14, q14, q0\n" /* relu */ \ + "vmax.f32 q15, q15, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W16_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q8, q8, q0\n" /* relu */ \ + "vmax.f32 q9, q9, q0\n" /* relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "vmax.f32 q12, q12, q0\n" /* relu */ \ + "vmax.f32 q13, q13, q0\n" /* relu */ \ + "vmax.f32 q14, q14, q0\n" /* relu */ \ + "vmax.f32 q15, q15, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W8_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q8, q8, q0\n" /* relu */ \ + "vmax.f32 q9, q9, q0\n" /* relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W4_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q8, q8, q0\n" /* relu */ \ + "vmax.f32 q9, q9, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W24_v7_SEMI1_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "vmax.f32 q12, q12, q0\n" /* relu */ \ + "vmax.f32 q13, q13, q0\n" /* relu */ \ + "vmax.f32 q14, q14, q0\n" /* relu */ \ + "vmax.f32 q15, q15, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W16_v7_SEMI1_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "vmax.f32 q12, q12, q0\n" /* relu */ \ + "vmax.f32 q13, q13, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W8_v7_SEMI1_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W4_v7_SEMI1_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp 
%[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W24_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q4, q4, q0\n" /* relu6 */ \ + "vmax.f32 q5, q5, q0\n" /* relu6 */ \ + "vmax.f32 q6, q6, q0\n" /* relu6 */ \ + "vmax.f32 q7, q7, q0\n" /* relu6 */ \ + "vmax.f32 q8, q8, q0\n" /* relu6 */ \ + "vmax.f32 q9, q9, q0\n" /* relu6 */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmax.f32 q12, q12, q0\n" /* relu6 */ \ + "vmax.f32 q13, q13, q0\n" /* relu6 */ \ + "vmax.f32 q14, q14, q0\n" /* relu6 */ \ + "vmax.f32 q15, q15, q0\n" /* relu6 */ \ + "vmin.f32 q4, q4, q1\n" /* relu6 */ \ + "vmin.f32 q5, q5, q1\n" /* relu6 */ \ + "vmin.f32 q6, q6, q1\n" /* relu6 */ \ + "vmin.f32 q7, q7, q1\n" /* relu6 */ \ + "vmin.f32 q8, q8, q1\n" /* relu6 */ \ + "vmin.f32 q9, q9, q1\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "vmin.f32 q12, q12, q1\n" /* relu6 */ \ + "vmin.f32 q13, q13, q1\n" /* relu6 */ \ + "vmin.f32 q14, q14, q1\n" /* relu6 */ \ + "vmin.f32 q15, q15, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W16_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q8, q8, q0\n" /* relu6 */ \ + "vmax.f32 q9, q9, q0\n" /* relu6 */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmax.f32 q12, q12, q0\n" /* relu6 */ \ + "vmax.f32 q13, q13, q0\n" /* relu6 */ \ + "vmax.f32 q14, q14, q0\n" /* relu6 */ \ + "vmax.f32 q15, q15, q0\n" /* relu6 */ \ + "vmin.f32 q8, q8, q1\n" /* relu6 */ \ + "vmin.f32 q9, q9, q1\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "vmin.f32 q12, q12, q1\n" /* relu6 */ \ + "vmin.f32 q13, q13, q1\n" /* relu6 */ \ + "vmin.f32 q14, q14, q1\n" /* relu6 */ \ + "vmin.f32 q15, q15, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W8_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q8, q8, q0\n" /* relu6 */ \ + "vmax.f32 q9, q9, q0\n" /* relu6 */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmin.f32 q8, q8, q1\n" /* relu6 */ \ + "vmin.f32 q9, q9, q1\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W4_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q8, q8, q0\n" /* relu6 */ \ + "vmax.f32 q9, q9, q0\n" /* relu6 */ \ + "vmin.f32 q8, q8, q1\n" /* relu6 */ \ + "vmin.f32 q9, q9, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W24_v7_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f 
\n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmax.f32 q12, q12, q0\n" /* relu6 */ \ + "vmax.f32 q13, q13, q0\n" /* relu6 */ \ + "vmax.f32 q14, q14, q0\n" /* relu6 */ \ + "vmax.f32 q15, q15, q0\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "vmin.f32 q12, q12, q1\n" /* relu6 */ \ + "vmin.f32 q13, q13, q1\n" /* relu6 */ \ + "vmin.f32 q14, q14, q1\n" /* relu6 */ \ + "vmin.f32 q15, q15, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W16_v7_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmax.f32 q12, q12, q0\n" /* relu6 */ \ + "vmax.f32 q13, q13, q0\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "vmin.f32 q12, q12, q1\n" /* relu6 */ \ + "vmin.f32 q13, q13, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W8_v7_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W4_v7_SEMI1_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_F32_F32_W24_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q4, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q4, q1 \n" /* vmulq_f32 */ \ + "vbif q4, q3, q2 \n" \ + "vcge.f32 q2, q5, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q5, q1 \n" /* vmulq_f32 */ \ + "vbif q5, q3, q2 \n" \ + "vcge.f32 q2, q6, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q6, q1 \n" /* vmulq_f32 */ \ + "vbif q6, q3, q2 \n" \ + "vcge.f32 q2, q7, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q7, q1 \n" /* vmulq_f32 */ \ + "vbif q7, q3, q2 \n" \ + "vcge.f32 q2, q8, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q8, q1 \n" /* vmulq_f32 */ \ + "vbif q8, q3, q2 \n" \ + "vcge.f32 q2, q9, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q9, q1 \n" /* vmulq_f32 */ \ + "vbif q9, q3, q2 \n" \ + "vcge.f32 q2, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q10, q1 \n" /* vmulq_f32 */ \ + "vbif q10, q3, q2 \n" \ + "vcge.f32 q2, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q11, q1 \n" /* vmulq_f32 */ \ + "vbif q11, q3, q2 \n" \ + "vcge.f32 q2, q12, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q12, q1 \n" /* vmulq_f32 */ \ + "vbif q12, q3, q2 \n" \ + "vcge.f32 q2, q13, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q13, q1 \n" /* vmulq_f32 */ \ + "vbif q13, q3, q2 \n" \ + "vcge.f32 q2, q14, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q14, q1 
\n" /* vmulq_f32 */ \ + "vbif q14, q3, q2 \n" \ + "vcge.f32 q2, q15, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q15, q1 \n" /* vmulq_f32 */ \ + "vbif q15, q3, q2 \n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W24_v7_SEMI2_HARD_SWISH \ + /* do relu */ \ + "12: \n" \ + "vld1.f32 {d0-d3}, [%[hs_param]]! @ load hard swish alpha\n" \ + "vadd.f32 q3, q4, q0 \n" \ + "vmul.f32 q4, q4, q1 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q4, q4, q3 \n" \ + "vadd.f32 q3, q5, q0 \n" \ + "vmul.f32 q5, q5, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q5, q5, q3 \n" \ + "vadd.f32 q3, q6, q0 \n" \ + "vmul.f32 q6, q6, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q6, q6, q3 \n" \ + "vadd.f32 q3, q7, q0 \n" \ + "vmul.f32 q7, q7, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q7, q7, q3 \n" \ + "vadd.f32 q3, q8, q0 \n" \ + "vmul.f32 q8, q8, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q8, q8, q3 \n" \ + "vadd.f32 q3, q9, q0 \n" \ + "vmul.f32 q9, q9, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q9, q9, q3 \n" \ + "vadd.f32 q3, q10, q0 \n" \ + "vmul.f32 q10, q10, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q10, q10, q3 \n" \ + "vadd.f32 q3, q11, q0 \n" \ + "vmul.f32 q11, q11, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q11, q11, q3 \n" \ + "vadd.f32 q3, q12, q0 \n" \ + "vmul.f32 q12, q12, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q12, q12, q3 \n" \ + "vadd.f32 q3, q13, q0 \n" \ + "vmul.f32 q13, q13, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q13, q13, q3 \n" \ + "vadd.f32 q3, q14, q0 \n" \ + "vmul.f32 q14, q14, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q14, q14, q3 \n" \ + "vadd.f32 q3, q15, q0 \n" \ + "vmul.f32 q15, q15, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q15, q15, q3 \n" \ + "9:\n" + +#define SPARSE_F32_F32_W16_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q8, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q8, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q9, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q9, q1 
\n" /* vmulq_f32 */ \ + "vcge.f32 q6, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q7, q10, q1 \n" /* vmulq_f32 */ \ + "vbif q8, q3, q2 \n" \ + "vbif q9, q5, q4 \n" \ + "vbif q10, q7, q6 \n" \ + "vcge.f32 q2, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q11, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q12, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q12, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q6, q13, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q7, q13, q1 \n" /* vmulq_f32 */ \ + "vbif q11, q3, q2 \n" \ + "vbif q12, q5, q4 \n" \ + "vbif q13, q7, q6 \n" \ + "vcge.f32 q2, q14, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q14, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q15, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q15, q1 \n" /* vmulq_f32 */ \ + "vbif q14, q3, q2 \n" \ + "vbif q15, q5, q4 \n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W16_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! \n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q8, q1 \n" \ + "vadd.f32 q6, q9, q1 \n" \ + "vmul.f32 q5, q8, q2 \n" \ + "vmul.f32 q7, q9, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q8, q5, q4 \n" \ + "vmul.f32 q9, q7, q6 \n" \ + "vadd.f32 q4, q10, q1 \n" \ + "vadd.f32 q6, q11, q1 \n" \ + "vmul.f32 q5, q10, q2 \n" \ + "vmul.f32 q7, q11, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q10, q5, q4 \n" \ + "vmul.f32 q11, q7, q6 \n" \ + "vadd.f32 q4, q12, q1 \n" \ + "vadd.f32 q6, q13, q1 \n" \ + "vmul.f32 q5, q12, q2 \n" \ + "vmul.f32 q7, q13, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q12, q5, q4 \n" \ + "vmul.f32 q13, q7, q6 \n" \ + "vadd.f32 q4, q14, q1 \n" \ + "vadd.f32 q6, q15, q1 \n" \ + "vmul.f32 q5, q14, q2 \n" \ + "vmul.f32 q7, q15, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q14, q5, q4 \n" \ + "vmul.f32 q15, q7, q6 \n" \ + "9:\n" + +#define SPARSE_F32_F32_W8_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q8, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q8, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q9, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q9, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q6, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q7, q10, q1 \n" /* vmulq_f32 */ \ + "vbif q8, q3, q2 \n" \ + "vbif q9, q5, q4 \n" \ + "vbif q10, q7, q6 \n" \ + "vcge.f32 q2, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q11, q1 \n" /* vmulq_f32 */ \ + "vbif q11, q3, q2 \n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W8_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! 
\n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q8, q1 \n" \ + "vadd.f32 q6, q9, q1 \n" \ + "vmul.f32 q5, q8, q2 \n" \ + "vmul.f32 q7, q9, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q8, q5, q4 \n" \ + "vmul.f32 q9, q7, q6 \n" \ + "vadd.f32 q4, q10, q1 \n" \ + "vadd.f32 q6, q11, q1 \n" \ + "vmul.f32 q5, q10, q2 \n" \ + "vmul.f32 q7, q11, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q10, q5, q4 \n" \ + "vmul.f32 q11, q7, q6 \n" \ + "9:\n" + +#define SPARSE_F32_F32_W4_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q8, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q8, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q9, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q9, q1 \n" /* vmulq_f32 */ \ + "vbif q8, q3, q2 \n" \ + "vbif q9, q5, q4 \n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W4_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! \n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q8, q1 \n" \ + "vadd.f32 q6, q9, q1 \n" \ + "vmul.f32 q5, q8, q2 \n" \ + "vmul.f32 q7, q9, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q8, q5, q4 \n" \ + "vmul.f32 q9, q7, q6 \n" \ + "9:\n" + +#define SPARSE_F32_F32_W24_v7_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q10, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q11, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q6, q12, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q7, q12, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q8, q13, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q9, q13, q1 \n" /* vmulq_f32 */ \ + "vbif q10, q3, q2 \n" \ + "vbif q11, q5, q4 \n" \ + "vbif q12, q7, q6 \n" \ + "vbif q13, q9, q8 \n" \ + "vcge.f32 q2, q14, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q14, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q15, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q15, q1 \n" /* vmulq_f32 */ \ + "vbif q14, q3, q2 \n" \ + "vbif q15, q5, q4 \n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W24_v7_SEMI1_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! 
\n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q10, q1 \n" \ + "vadd.f32 q6, q11, q1 \n" \ + "vadd.f32 q8, q12, q1 \n" \ + "vmul.f32 q5, q10, q2 \n" \ + "vmul.f32 q7, q11, q2 \n" \ + "vmul.f32 q9, q12, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmax.f32 q8, q8, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmin.f32 q8, q8, q3 \n" \ + "vmul.f32 q10, q5, q4 \n" \ + "vmul.f32 q11, q7, q6 \n" \ + "vmul.f32 q12, q9, q8 \n" \ + "vadd.f32 q4, q13, q1 \n" \ + "vadd.f32 q6, q14, q1 \n" \ + "vadd.f32 q8, q15, q1 \n" \ + "vmul.f32 q5, q13, q2 \n" \ + "vmul.f32 q7, q14, q2 \n" \ + "vmul.f32 q9, q15, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmax.f32 q8, q8, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmin.f32 q8, q8, q3 \n" \ + "vmul.f32 q13, q5, q4 \n" \ + "vmul.f32 q14, q7, q6 \n" \ + "vmul.f32 q15, q9, q8 \n" \ + "9:\n" + +#define SPARSE_F32_F32_W16_v7_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q10, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q11, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q6, q12, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q7, q12, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q8, q13, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q9, q13, q1 \n" /* vmulq_f32 */ \ + "vbif q10, q3, q2 \n" \ + "vbif q11, q5, q4 \n" \ + "vbif q12, q7, q6 \n" \ + "vbif q13, q9, q8 \n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W16_v7_SEMI1_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! \n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q10, q1 \n" \ + "vadd.f32 q6, q11, q1 \n" \ + "vadd.f32 q8, q12, q1 \n" \ + "vmul.f32 q5, q10, q2 \n" \ + "vmul.f32 q7, q11, q2 \n" \ + "vmul.f32 q9, q12, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmax.f32 q8, q8, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmin.f32 q8, q8, q3 \n" \ + "vmul.f32 q10, q5, q4 \n" \ + "vmul.f32 q11, q7, q6 \n" \ + "vmul.f32 q12, q9, q8 \n" \ + "vadd.f32 q4, q13, q1 \n" \ + "vmul.f32 q5, q13, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmul.f32 q13, q5, q4 \n" \ + "9:\n" + +#define SPARSE_F32_F32_W8_v7_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q10, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q11, q1 \n" /* vmulq_f32 */ \ + "vbif q10, q3, q2 \n" \ + "vbif q11, q5, q4 \n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W8_v7_SEMI1_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! 
\n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q10, q1 \n" \ + "vadd.f32 q6, q11, q1 \n" \ + "vmul.f32 q5, q10, q2 \n" \ + "vmul.f32 q7, q11, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q10, q5, q4 \n" \ + "vmul.f32 q11, q7, q6 \n" \ + "9:\n" + +#define SPARSE_F32_F32_W4_v7_SEMI1_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q10, q1 \n" /* vmulq_f32 */ \ + "vbif q10, q3, q2 \n" \ + "b 9f \n" + +#define SPARSE_F32_F32_W4_v7_SEMI1_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! \n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q10, q1 \n" \ + "vmul.f32 q5, q10, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmul.f32 q10, q5, q4 \n" \ + "9:\n" + +#define SPARSE_F32_F32_W24_v7_SEMI2_OUT \ + SPARSE_F32_F32_W24_v7_SEMI2_KERNEL SPARSE_F32_F32_W24_v7_SEMI2_RELU \ + SPARSE_F32_F32_W24_v7_SEMI2_RELU6 SPARSE_F32_F32_W24_v7_SEMI2_LEAKY_RELU \ + SPARSE_F32_F32_W24_v7_SEMI2_HARD_SWISH \ + "mov r0, %[c_ptr1]\n" \ + "mov r1, %[c_ptr2]\n" /* store result */ \ + "vst1.32 {d8-d11}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d12-d15}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d16-d19}, [r0]\n" \ + "vst1.32 {d20-d23}, [r1]\n" \ + "add r1, r1, #32\n" \ + "vst1.32 {d24-d27}, [r1]\n" \ + "add r1, r1, #32\n" \ + "vst1.32 {d28-d31}, [r1]\n" + +#define SPARSE_F32_F32_W16_v7_SEMI2_OUT \ + SPARSE_F32_F32_W16_v7_SEMI2_KERNEL SPARSE_F32_F32_W16_v7_SEMI2_RELU \ + SPARSE_F32_F32_W16_v7_SEMI2_RELU6 SPARSE_F32_F32_W16_v7_SEMI2_LEAKY_RELU \ + SPARSE_F32_F32_W16_v7_SEMI2_HARD_SWISH \ + "mov r0, %[c_ptr1]\n" \ + "mov r1, %[c_ptr2]\n" /* store result */ \ + "vst1.32 {d16-d19}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d20-d23}, [r0]\n" \ + "vst1.32 {d24-d27}, [r1]\n" \ + "add r1, r1, #32\n" \ + "vst1.32 {d28-d31}, [r1]\n" + +#define SPARSE_F32_F32_W8_v7_SEMI2_OUT \ + SPARSE_F32_F32_W8_v7_SEMI2_KERNEL SPARSE_F32_F32_W8_v7_SEMI2_RELU \ + SPARSE_F32_F32_W8_v7_SEMI2_RELU6 SPARSE_F32_F32_W8_v7_SEMI2_LEAKY_RELU \ + SPARSE_F32_F32_W8_v7_SEMI2_HARD_SWISH /* store result */ \ + "vst1.32 {d16-d19}, [%[c_ptr1]]\n" \ + "vst1.32 {d20-d23}, [%[c_ptr2]]\n" + +#define SPARSE_F32_F32_W4_v7_SEMI2_OUT \ + SPARSE_F32_F32_W4_v7_SEMI2_KERNEL SPARSE_F32_F32_W4_v7_SEMI2_RELU \ + SPARSE_F32_F32_W4_v7_SEMI2_RELU6 SPARSE_F32_F32_W4_v7_SEMI2_LEAKY_RELU \ + SPARSE_F32_F32_W4_v7_SEMI2_HARD_SWISH /* store result */ \ + "vst1.32 {d16-d17}, [%[c_ptr1]]\n" \ + "vst1.32 {d18-d19}, [%[c_ptr2]]\n" + +#define SPARSE_F32_F32_W24_v7_SEMI1_OUT \ + SPARSE_F32_F32_W24_v7_SEMI1_KERNEL SPARSE_F32_F32_W24_v7_SEMI1_RELU \ + SPARSE_F32_F32_W24_v7_SEMI1_RELU6 SPARSE_F32_F32_W24_v7_SEMI1_LEAKY_RELU \ + SPARSE_F32_F32_W24_v7_SEMI1_HARD_SWISH \ + "mov r0, %[c_ptr]\n" /* store result */ \ + "vst1.32 {d20-d23}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d24-d27}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d28-d31}, [r0]\n" + +#define SPARSE_F32_F32_W16_v7_SEMI1_OUT \ + SPARSE_F32_F32_W16_v7_SEMI1_KERNEL SPARSE_F32_F32_W16_v7_SEMI1_RELU \ + SPARSE_F32_F32_W16_v7_SEMI1_RELU6 SPARSE_F32_F32_W16_v7_SEMI1_LEAKY_RELU \ + SPARSE_F32_F32_W16_v7_SEMI1_HARD_SWISH \ + "mov r0, %[c_ptr]\n" /* store result */ \ + "vst1.32 {d20-d23}, [r0]\n" \ + "add r0, r0, 
#32\n" \ + "vst1.32 {d24-d27}, [r0]\n" + +#define SPARSE_F32_F32_W8_v7_SEMI1_OUT \ + SPARSE_F32_F32_W8_v7_SEMI1_KERNEL SPARSE_F32_F32_W8_v7_SEMI1_RELU \ + SPARSE_F32_F32_W8_v7_SEMI1_RELU6 SPARSE_F32_F32_W8_v7_SEMI1_LEAKY_RELU \ + SPARSE_F32_F32_W8_v7_SEMI1_HARD_SWISH /* store result */ \ + "vst1.32 {d20-d23}, [%[c_ptr]]\n" + +#define SPARSE_F32_F32_W4_v7_SEMI1_OUT \ + SPARSE_F32_F32_W4_v7_SEMI1_KERNEL SPARSE_F32_F32_W4_v7_SEMI1_RELU \ + SPARSE_F32_F32_W4_v7_SEMI1_RELU6 SPARSE_F32_F32_W4_v7_SEMI1_LEAKY_RELU \ + SPARSE_F32_F32_W4_v7_SEMI1_HARD_SWISH /* store result */ \ + "vst1.32 {d20-d21}, [%[c_ptr]]\n" + +#define INIT_SEMI_CONV_PARAM(Dtype_i, Dtype_o, start_w) \ + auto act_param = param.activation_param; \ + auto act_type = act_param.active_type; \ + volatile float alpha = 0.f; \ + int flag_act = 0x00; /* relu: 1, relu6: 2, leakey: 3 */ \ + float vhs_param[12] = {0.f}; \ + if (act_param.has_active) { \ + if (act_type == lite_api::ActivationType::kRelu) { \ + flag_act = 0x01; \ + } else if (act_type == lite_api::ActivationType::kRelu6) { \ + flag_act = 0x02; \ + alpha = act_param.Relu_clipped_coef; \ + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { \ + flag_act = 0x03; \ + alpha = act_param.Leaky_relu_alpha; \ + } else if (act_type == lite_api::ActivationType::kHardSwish) { \ + flag_act = 0x04; \ + for (int i = 0; i < 4; i++) { \ + vhs_param[i] = act_param.hard_swish_offset; \ + vhs_param[i + 4] = 1.0 / act_param.hard_swish_scale; \ + vhs_param[i + 8] = act_param.hard_swish_threshold; \ + } \ + } \ + } \ + int flag_bias = (bias != nullptr) ? 1 : 0; \ + size_t mc = N * sizeof(Dtype_i); \ + size_t nc = M; \ + size_t output_stride = N * sizeof(Dtype_o); \ + size_t output_decrement = output_stride * nc - start_w * sizeof(Dtype_o); \ + size_t pair_num = M / 2; \ + size_t lave_num = M % 2; + +#define GET_SEMI_PARAM_TABLE(Dtype_i, Dtype_o) \ + Dtype_o* out_ptr1 = \ + reinterpret_cast((uintptr_t)output + output_stride * 2 * i); \ + Dtype_o* out_ptr2 = reinterpret_cast((uintptr_t)output + \ + output_stride * (2 * i + 1)); \ + const Dtype_i* cur_w = A; \ + uint32_t nnz = nidx_nnzmap[i]; \ + const Dtype_i* cur_b = B; \ + const int32_t* dmap = widx_dmap; \ + if (i != 0) { \ + cur_w = A + nidx_nnzmap[i - 1] * 2; \ + nnz = nidx_nnzmap[i] - nidx_nnzmap[i - 1]; \ + cur_b += ((nidx_nnzmap[i - 1] == 0) \ + ? 0 \ + : (widx_dmap[nidx_nnzmap[i - 1] - 1] / sizeof(Dtype_i))); \ + dmap = widx_dmap + nidx_nnzmap[i - 1]; \ + } \ + const float* pbias = (bias != nullptr) ? (bias + i * 2) : bias_zero; + +#define GET_UNSTRUCT_PARAM_TABLE(Dtype_i, Dtype_o) \ + Dtype_o* out_ptr = reinterpret_cast((uintptr_t)output + \ + output_stride * 2 * pair_num); \ + const Dtype_i* cur_w = A; \ + uint32_t nnz = nidx_nnzmap[pair_num]; \ + const Dtype_i* cur_b = B; \ + const int32_t* dmap = widx_dmap; \ + if (pair_num != 0) { \ + cur_w = A + nidx_nnzmap[pair_num - 1] * 2; \ + nnz = nidx_nnzmap[pair_num] - nidx_nnzmap[pair_num - 1]; \ + cur_b += \ + ((nidx_nnzmap[pair_num - 1] == 0) \ + ? 0 \ + : (widx_dmap[nidx_nnzmap[pair_num - 1] - 1] / sizeof(Dtype_i))); \ + dmap = widx_dmap + nidx_nnzmap[pair_num - 1]; \ + } \ + float vbias = (bias != nullptr) ? 
+
+#define SET_ASSM_INPUT_PARAM1_V7_F32 \
+: [a_ptr] "+r"(cur_w), \
+  [b_ptr] "+r"(cur_b), \
+  [c_ptr1] "+r"(out_ptr1), \
+  [c_ptr2] "+r"(out_ptr2), \
+  [k] "+r"(nnz), \
+  [widx_dmap] "+r"(dmap), \
+  [bias_ptr] "+r"(pbias), \
+  [hs_param] "+r"(hs_param) \
+: [vflag_act] "r"(flag_act), \
+  [valpha] "r"(alpha)
+
+#define SET_ASSM_INPUT_PARAM2_V7_F32 \
+: [a_ptr] "+r"(cur_w), \
+  [b_ptr] "+r"(cur_b), \
+  [c_ptr] "+r"(out_ptr), \
+  [k] "+r"(nnz), \
+  [widx_dmap] "+r"(dmap), \
+  [hs_param] "+r"(hs_param) \
+: [vbias] "r"(vbias), \
+  [vflag_act] "r"(flag_act), \
+  [valpha] "r"(alpha)
+
+#define COMPUTE_ACT_NEON_TWO_V7_F32 \
+  if (flag_act == 1) { \
+    float32x2_t vzero = vdup_n_f32(0); \
+    vacc01n0 = vmax_f32(vacc01n0, vzero); \
+    vacc01n1 = vmax_f32(vacc01n1, vzero); \
+  } else if (flag_act == 2) { \
+    float32x2_t vzero = vdup_n_f32(0); \
+    float32x2_t aph = vdup_n_f32(alpha); \
+    vacc01n0 = vmax_f32(vacc01n0, vzero); \
+    vacc01n1 = vmax_f32(vacc01n1, vzero); \
+    vacc01n0 = vmin_f32(vacc01n0, aph); \
+    vacc01n1 = vmin_f32(vacc01n1, aph); \
+  } else if (flag_act == 0) { \
+  } else if (flag_act == 3) { \
+    float32x2_t vzero = vdup_n_f32(0); \
+    float32x2_t aph = vdup_n_f32(alpha); \
+    uint32x2_t vflag0123 = vcge_f32(vacc01n0, vzero); \
+    uint32x2_t vflag4567 = vcge_f32(vacc01n1, vzero); \
+    float32x2_t v0123 = vmul_f32(vacc01n0, aph); \
+    float32x2_t v4567 = vmul_f32(vacc01n1, aph); \
+    vacc01n0 = vbsl_f32(vflag0123, vacc01n0, v0123); \
+    vacc01n1 = vbsl_f32(vflag4567, vacc01n1, v4567); \
+  } else { \
+    float32x2_t vzero = vdup_n_f32(0); \
+    float32x2_t offset = vdup_n_f32(act_param.hard_swish_offset); \
+    float32x2_t hs_scale = vdup_n_f32(1.0 / act_param.hard_swish_scale); \
+    float32x2_t thre = vdup_n_f32(act_param.hard_swish_threshold); \
+    float32x2_t vset0123_1 = vadd_f32(vacc01n0, offset); \
+    float32x2_t vscale0123_1 = vmul_f32(vacc01n0, hs_scale); \
+    float32x2_t vset0123_2 = vadd_f32(vacc01n1, offset); \
+    float32x2_t vscale0123_2 = vmul_f32(vacc01n1, hs_scale); \
+    vset0123_1 = vmax_f32(vset0123_1, vzero); \
+    vset0123_1 = vmin_f32(vset0123_1, thre); \
+    vset0123_2 = vmax_f32(vset0123_2, vzero); \
+    vset0123_2 = vmin_f32(vset0123_2, thre); \
+    vacc01n0 = vmul_f32(vscale0123_1, vset0123_1); \
+    vacc01n1 = vmul_f32(vscale0123_2, vset0123_2); \
+  }
+
+#define COMPUTE_ACT_NEON_ONE_V7_F32 \
+  if (flag_act == 1) { \
+    float32x2_t vzero = vdup_n_f32(0); \
+    vacc01n0 = vmax_f32(vacc01n0, vzero); \
+  } else if (flag_act == 2) { \
+    float32x2_t vzero = vdup_n_f32(0); \
+    float32x2_t aph = vdup_n_f32(alpha); \
+    vacc01n0 = vmax_f32(vacc01n0, vzero); \
+    vacc01n0 = vmin_f32(vacc01n0, aph); \
+  } else if (flag_act == 0) { \
+  } else if (flag_act == 3) { \
+    float32x2_t vzero = vdup_n_f32(0); \
+    float32x2_t aph = vdup_n_f32(alpha); \
+    uint32x2_t vflag0123 = vcge_f32(vacc01n0, vzero); \
+    float32x2_t v0123 = vmul_f32(vacc01n0, aph); \
+    vacc01n0 = vbsl_f32(vflag0123, vacc01n0, v0123); \
+  } else { \
+    float32x2_t vzero = vdup_n_f32(0); \
+    float32x2_t offset = vdup_n_f32(act_param.hard_swish_offset); \
+    float32x2_t hs_scale = vdup_n_f32(1.0 / act_param.hard_swish_scale); \
+    float32x2_t thre = vdup_n_f32(act_param.hard_swish_threshold); \
+    float32x2_t vset0123 = vadd_f32(vacc01n0, offset); \
+    float32x2_t vscale0123 = vmul_f32(vacc01n0, hs_scale); \
+    vset0123 = vmax_f32(vset0123, vzero); \
+    vset0123 = vmin_f32(vset0123, thre); \
+    vacc01n0 = vmul_f32(vscale0123, vset0123); \
+  }
+
+/**
+ * \brief Semi-structured sparse implementation of 1x1 convolution; both
+ * the input and the output are f32.
+ * The semi-structured sparse matrix multiplication is computed in blocks:
+ * the weight matrix is MxK, the activation matrix is KxN, and output tiles
+ * of Mx24 are produced at a time. When fewer than 24 output columns remain,
+ * the remainder is processed in blocks of Mx16, Mx8, Mx4, Mx2 and Mx1 in
+ * turn.
+ * @param A semi-structured sparse weight data
+ * @param B dense input data
+ * @param widx_dmap array of int32_t values storing the scaled [by
+ * sizeof(input element)] differences between the input channels of
+ * successive non-zero elements
+ * @param nidx_nnzmap cumulative count of non-zero kernel elements per
+ * output-channel pair
+ * @param bias bias data
+ * @param output output data
+ * @param M rows of the weight matrix (output channels)
+ * @param K columns of the weight matrix (input channels)
+ * @param N columns of the activation matrix
+ * @param param sparse conv param
+ * @param ctx ARM context
+ */
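+// A minimal scalar sketch of what the SEMI2 kernels compute for one
+// output-channel pair (illustrative only; cur_w/cur_b/dmap/pbias are set up
+// by GET_SEMI_PARAM_TABLE, and the real kernels vectorize this over
+// 24/16/8/4 output columns at a time):
+//   float acc0 = pbias[0], acc1 = pbias[1];
+//   for (uint32_t j = 0; j < nnz; j++) {
+//     acc0 += cur_w[2 * j] * (*cur_b);      // row 2*i of the pair
+//     acc1 += cur_w[2 * j + 1] * (*cur_b);  // row 2*i + 1 of the pair
+//     cur_b = (const float*)((uintptr_t)cur_b + dmap[j]);  // byte step
+//   }
+// followed by the activation selected by flag_act.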
+void sparse_semi_conv_fp32_pipelined(const float* A,
+                                     const float* B,
+                                     const int32_t* widx_dmap,
+                                     const uint32_t* nidx_nnzmap,
+                                     const float* bias,
+                                     float* output,
+                                     const int M,
+                                     const int K,
+                                     const int N,
+                                     const operators::SparseConvParam& param,
+                                     ARMContext* ctx) {
+  INIT_SEMI_CONV_PARAM(float, float, 24)
+  float bias_zero[2] = {0.f, 0.f};
+  while
+    SPARSE_LIKELY(mc >= 24 * sizeof(float)) {
+      LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+        GET_SEMI_PARAM_TABLE(float, float)
+        float* hs_param = vhs_param;
+        // clang-format off
+        asm volatile(SPARSE_F32_F32_W24_v7_SEMI2_OUT
+          SET_ASSM_INPUT_PARAM1_V7_F32
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+            "q11", "q12", "q13", "q14", "q15", "r0", "r1", "cc", "memory");
+        // clang-format on
+      }
+      LITE_PARALLEL_COMMON_END();
+      if
+        SPARSE_UNLIKELY(lave_num != 0) {
+          GET_UNSTRUCT_PARAM_TABLE(float, float)
+          float* hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_F32_F32_W24_v7_SEMI1_OUT
+            SET_ASSM_INPUT_PARAM2_V7_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+              "q11", "q12", "q13", "q14", "q15", "r0", "r1", "r2", "cc", "memory");
+          // clang-format on
+        }
+      output =
+          reinterpret_cast<float*>((uintptr_t)output + 24 * sizeof(float));
+      B += 24;
+      mc -= 24 * sizeof(float);
+    }
+
+  if
+    SPARSE_UNLIKELY(mc != 0) {
+      output_decrement += 16 * sizeof(float);
+      if (mc & (16 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          float* hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_F32_F32_W16_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_V7_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+              "q11", "q12", "q13", "q14", "q15", "r0", "r1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            float* hs_param = vhs_param;
+            // clang-format off
+            asm volatile(SPARSE_F32_F32_W16_v7_SEMI1_OUT
+              SET_ASSM_INPUT_PARAM2_V7_F32
+              : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+                "q11", "q12", "q13", "r0", "r1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 16 * sizeof(float));
+        B += 16;
+        mc -= 16 * sizeof(float);
+      }
+      output_decrement += 8 * sizeof(float);
+      if (mc & (8 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          float* hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_F32_F32_W8_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_V7_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+              "q11", "r0", "r1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
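+        // M is handled two output rows at a time (pair_num = M / 2); when M
+        // is odd, lave_num routes the final row through the single-row
+        // SEMI1 kernel.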
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            float* hs_param = vhs_param;
+            // clang-format off
+            asm volatile(SPARSE_F32_F32_W8_v7_SEMI1_OUT
+              SET_ASSM_INPUT_PARAM2_V7_F32
+              : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
+                "r0", "r1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 8 * sizeof(float));
+        B += 8;
+        mc -= 8 * sizeof(float);
+      }
+      output_decrement += 4 * sizeof(float);
+      if (mc & (4 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          float* hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_F32_F32_W4_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_V7_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+              "r0", "r1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            float* hs_param = vhs_param;
+            // clang-format off
+            asm volatile(SPARSE_F32_F32_W4_v7_SEMI1_OUT
+              SET_ASSM_INPUT_PARAM2_V7_F32
+              : "q0", "q1", "q2", "q3", "q4", "q5", "q10", "r0", "r1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 4 * sizeof(float));
+        B += 4;
+        mc -= 4 * sizeof(float);
+      }
+      output_decrement += 2 * sizeof(float);
+      if (mc & (2 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          float32x2_t vacc01n0 = vld1_dup_f32(pbias);
+          float32x2_t vacc01n1 = vld1_dup_f32(pbias + 1);
+          if
+            SPARSE_LIKELY(nnz != 0) {
+              do {
+                const intptr_t diff = *dmap++;
+                const float32x2_t vi01 = vld1_f32(cur_b);
+                cur_b = (const float*)((uintptr_t)cur_b + (uintptr_t)diff);
+                const float32x2_t vw = vld1_f32(cur_w);
+                cur_w += 2;
+
+                vacc01n0 = vmla_lane_f32(vacc01n0, vi01, vw, 0);
+                vacc01n1 = vmla_lane_f32(vacc01n1, vi01, vw, 1);
+              } while (--nnz != 0);
+            }
+
+          COMPUTE_ACT_NEON_TWO_V7_F32
+          vst1_f32(out_ptr1, vacc01n0);
+          vst1_f32(out_ptr2, vacc01n1);
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
+            float32x2_t vacc01n0 = vdup_n_f32(vbias);
+            if
+              SPARSE_LIKELY(nnz != 0) {
+                do {
+                  const intptr_t diff = *dmap++;
+                  const float32x2_t vi01 = vld1_f32(cur_b);
+                  cur_b = (const float*)((uintptr_t)cur_b + (uintptr_t)diff);
+                  const float32x2_t vw = vld1_dup_f32(cur_w);
+                  cur_w += 1;
+                  vacc01n0 = vmla_lane_f32(vacc01n0, vi01, vw, 0);
+                } while (--nnz != 0);
+              }
+            COMPUTE_ACT_NEON_ONE_V7_F32
+            vst1_f32(out_ptr, vacc01n0);
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 2 * sizeof(float));
+        B += 2;
+        mc -= 2 * sizeof(float);
+      }
+
+      output_decrement += 1 * sizeof(float);
+      if (mc & (1 * sizeof(float))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_PARAM_TABLE(float, float)
+          float32x2_t vacc01n0 = vld1_dup_f32(pbias);
+          float32x2_t vacc01n1 = vld1_dup_f32(pbias + 1);
+          if
+            SPARSE_LIKELY(nnz != 0) {
+              do {
+                const intptr_t diff = *dmap++;
+                const float32x2_t vi01 = vld1_dup_f32(cur_b);
+                cur_b = (const float*)((uintptr_t)cur_b + (uintptr_t)diff);
+                const float32x2_t vw = vld1_f32(cur_w);
+                cur_w += 2;
+
+                vacc01n0 = vmla_lane_f32(vacc01n0, vi01, vw, 0);
+                vacc01n1 = vmla_lane_f32(vacc01n1, vi01, vw, 1);
+              } while (--nnz != 0);
+            }
+
+          COMPUTE_ACT_NEON_TWO_V7_F32
+          vst1_lane_f32(out_ptr1, vacc01n0, 0);
+          vst1_lane_f32(out_ptr2, vacc01n1, 0);
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(float, float)
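+            // Trailing odd row of the single-column tail: one scalar
+            // output, accumulated in lane 0 only.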
+            float32x2_t vacc01n0 = vdup_n_f32(vbias);
+            if
+              SPARSE_LIKELY(nnz != 0) {
+                do {
+                  const intptr_t diff = *dmap++;
+                  const float32x2_t vi01 = vld1_dup_f32(cur_b);
+                  cur_b = (const float*)((uintptr_t)cur_b + (uintptr_t)diff);
+                  const float32x2_t vw = vld1_dup_f32(cur_w);
+                  cur_w += 1;
+                  vacc01n0 = vmla_lane_f32(vacc01n0, vi01, vw, 0);
+                } while (--nnz != 0);
+              }
+            COMPUTE_ACT_NEON_ONE_V7_F32
+            vst1_lane_f32(out_ptr, vacc01n0, 0);
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 1 * sizeof(float));
+        B += 1;
+        mc -= 1 * sizeof(float);
+      }
+    }
+}
+
+#define SPARSE_INT8_F32_W48_v7_SEMI2_KERNEL \
+  "veor q4, q0, q0\n" \
+  "veor q5, q1, q1\n" \
+  "veor q6, q2, q2\n" \
+  "veor q7, q3, q3\n" \
+  "pld [%[a_ptr], #32] \n" \
+  "veor q8, q0, q0\n" \
+  "veor q9, q1, q1\n" \
+  "pld [%[widx_dmap], #32] \n" \
+  "veor q10, q2, q2\n" \
+  "veor q11, q3, q3\n" \
+  "veor q12, q0, q0\n" \
+  "pld [%[b_ptr], #64] \n" \
+  "veor q13, q1, q1\n" \
+  "veor q14, q2, q2\n" \
+  "veor q15, q3, q3\n" \
+  "cmp %[k], #0\n" \
+  "beq 1f\n" /* main loop*/ \
+  "0:\n" \
+  "ldrsb r0, [%[a_ptr]]\n" \
+  "subs %[k], %[k], #1\n" \
+  "mov r1, %[b_ptr]\n" \
+  "add %[a_ptr], %[a_ptr], %[vdif]\n" \
+  "vld1.8 {d2-d5}, [r1]\n" \
+  "vdup.8 d0, r0\n" \
+  "vmull.s8 q3, d2, d0\n" \
+  "vaddw.s16 q4, q4, d6\n" \
+  "vaddw.s16 q5, q5, d7\n" \
+  "vmull.s8 q3, d3, d0\n" \
+  "pld [%[widx_dmap], #32] \n" \
+  "vaddw.s16 q6, q6, d6\n" \
+  "vaddw.s16 q7, q7, d7\n" \
+  "vmull.s8 q3, d4, d0\n" \
+  "vaddw.s16 q8, q8, d6\n" \
+  "vaddw.s16 q9, q9, d7\n" \
+  "vmull.s8 q3, d5, d0\n" \
+  "ldr r0, [%[widx_dmap]], #4\n" \
+  "add %[b_ptr], %[b_ptr], r0\n" \
+  "vaddw.s16 q10, q10, d6\n" \
+  "vaddw.s16 q11, q11, d7\n" \
+  "pld [%[b_ptr], #64] \n" \
+  "add r1, r1, #32\n" \
+  "vld1.8 {d2-d3}, [r1]\n" \
+  "vmull.s8 q3, d2, d0\n" \
+  "vaddw.s16 q12, q12, d6\n" \
+  "pld [%[a_ptr], #32] \n" \
+  "vaddw.s16 q13, q13, d7\n" \
+  "vmull.s8 q3, d3, d0\n" \
+  "vaddw.s16 q14, q14, d6\n" \
+  "vaddw.s16 q15, q15, d7\n" \
+  "bne 0b\n" \
+  "1:\n"
+
+#define SPARSE_INT8_TRANS_INT32_TO_FP32_W48_v7_SEMI2 \
+  /* write output */ \
+  "vdup.32 q0, %[vscale]\n" \
+  "vdup.32 q1, %[vbias]\n" \
+  "vcvt.f32.s32 q2, q4\n" /* cvt int32 to fp32*/ \
+  "vcvt.f32.s32 q3, q5\n" /* cvt int32 to fp32*/ \
+  "vdup.32 q4, d2[0]\n" \
+  "vdup.32 q5, d2[0]\n" \
+  "vmla.f32 q4, q2, q0\n" \
+  "vmla.f32 q5, q3, q0\n" \
+  "vcvt.f32.s32 q2, q6\n" /* cvt int32 to fp32*/ \
+  "vcvt.f32.s32 q3, q7\n" /* cvt int32 to fp32*/ \
+  "vdup.32 q6, d2[0]\n" \
+  "vdup.32 q7, d2[0]\n" \
+  "vmla.f32 q6, q2, q0\n" \
+  "vmla.f32 q7, q3, q0\n" \
+  "vcvt.f32.s32 q2, q8\n" /* cvt int32 to fp32*/ \
+  "vcvt.f32.s32 q3, q9\n" /* cvt int32 to fp32*/ \
+  "vdup.32 q8, d2[0]\n" \
+  "vdup.32 q9, d2[0]\n" \
+  "vmla.f32 q8, q2, q0\n" \
+  "vmla.f32 q9, q3, q0\n" \
+  "vcvt.f32.s32 q2, q10\n" /* cvt int32 to fp32*/ \
+  "vcvt.f32.s32 q3, q11\n" /* cvt int32 to fp32*/ \
+  "vdup.32 q10, d2[0]\n" \
+  "vdup.32 q11, d2[0]\n" \
+  "vmla.f32 q10, q2, q0\n" \
+  "vmla.f32 q11, q3, q0\n" \
+  "vcvt.f32.s32 q2, q12\n" /* cvt int32 to fp32*/ \
+  "vcvt.f32.s32 q3, q13\n" /* cvt int32 to fp32*/ \
+  "vdup.32 q12, d2[0]\n" \
+  "vdup.32 q13, d2[0]\n" \
+  "vmla.f32 q12, q2, q0\n" \
+  "vmla.f32 q13, q3, q0\n" \
+  "vcvt.f32.s32 q2, q14\n" /* cvt int32 to fp32*/ \
+  "vcvt.f32.s32 q3, q15\n" /* cvt int32 to fp32*/ \
+  "vdup.32 q14, d2[0]\n" \
+  "vdup.32 q15, d2[0]\n" \
+  "vmla.f32 q14, q2, q0\n" \
+  "vmla.f32 q15, q3, q0\n"
+
+#define SPARSE_INT8_F32_W48_v7_SEMI2_RELU \
+  /* do relu */ \
+  "cmp %[vflag_act], #0\n" /* skip relu */ \
+  "beq 9f \n" /* no act end */ \
"cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q4, q4, q0\n" /* relu */ \ + "vmax.f32 q5, q5, q0\n" /* relu */ \ + "vmax.f32 q6, q6, q0\n" /* relu */ \ + "vmax.f32 q7, q7, q0\n" /* relu */ \ + "vmax.f32 q8, q8, q0\n" /* relu */ \ + "vmax.f32 q9, q9, q0\n" /* relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "vmax.f32 q12, q12, q0\n" /* relu */ \ + "vmax.f32 q13, q13, q0\n" /* relu */ \ + "vmax.f32 q14, q14, q0\n" /* relu */ \ + "vmax.f32 q15, q15, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W48_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q4, q4, q0\n" /* relu6 */ \ + "vmax.f32 q5, q5, q0\n" /* relu6 */ \ + "vmax.f32 q6, q6, q0\n" /* relu6 */ \ + "vmax.f32 q7, q7, q0\n" /* relu6 */ \ + "vmax.f32 q8, q8, q0\n" /* relu6 */ \ + "vmax.f32 q9, q9, q0\n" /* relu6 */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmax.f32 q12, q12, q0\n" /* relu6 */ \ + "vmax.f32 q13, q13, q0\n" /* relu6 */ \ + "vmax.f32 q14, q14, q0\n" /* relu6 */ \ + "vmax.f32 q15, q15, q0\n" /* relu6 */ \ + "vmin.f32 q4, q4, q1\n" /* relu6 */ \ + "vmin.f32 q5, q5, q1\n" /* relu6 */ \ + "vmin.f32 q6, q6, q1\n" /* relu6 */ \ + "vmin.f32 q7, q7, q1\n" /* relu6 */ \ + "vmin.f32 q8, q8, q1\n" /* relu6 */ \ + "vmin.f32 q9, q9, q1\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "vmin.f32 q12, q12, q1\n" /* relu6 */ \ + "vmin.f32 q13, q13, q1\n" /* relu6 */ \ + "vmin.f32 q14, q14, q1\n" /* relu6 */ \ + "vmin.f32 q15, q15, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W48_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q4, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q4, q1 \n" /* vmulq_f32 */ \ + "vbif q4, q3, q2 \n" \ + "vcge.f32 q2, q5, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q5, q1 \n" /* vmulq_f32 */ \ + "vbif q5, q3, q2 \n" \ + "vcge.f32 q2, q6, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q6, q1 \n" /* vmulq_f32 */ \ + "vbif q6, q3, q2 \n" \ + "vcge.f32 q2, q7, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q7, q1 \n" /* vmulq_f32 */ \ + "vbif q7, q3, q2 \n" \ + "vcge.f32 q2, q8, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q8, q1 \n" /* vmulq_f32 */ \ + "vbif q8, q3, q2 \n" \ + "vcge.f32 q2, q9, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q9, q1 \n" /* vmulq_f32 */ \ + "vbif q9, q3, q2 \n" \ + "vcge.f32 q2, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q10, q1 \n" /* vmulq_f32 */ \ + "vbif q10, q3, q2 \n" \ + "vcge.f32 q2, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q11, q1 \n" /* vmulq_f32 */ \ + "vbif q11, q3, q2 \n" \ + "vcge.f32 q2, q12, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q12, q1 \n" /* vmulq_f32 */ \ + "vbif q12, q3, q2 \n" \ + "vcge.f32 q2, q13, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q13, q1 \n" /* vmulq_f32 */ \ + "vbif q13, q3, q2 \n" \ + "vcge.f32 q2, q14, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q14, q1 \n" /* vmulq_f32 */ \ + "vbif q14, q3, q2 \n" \ + "vcge.f32 q2, q15, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q15, q1 \n" /* vmulq_f32 */ \ + "vbif q15, q3, q2 \n" \ + "b 9f 
\n" + +#define SPARSE_INT8_F32_W48_v7_SEMI2_HARD_SWISH \ + /* do relu */ \ + "12: \n" \ + "vld1.f32 {d0-d3}, [%[hs_param]]! @ load hard swish alpha\n" \ + "vadd.f32 q3, q4, q0 \n" \ + "vmul.f32 q4, q4, q1 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q4, q4, q3 \n" \ + "vadd.f32 q3, q5, q0 \n" \ + "vmul.f32 q5, q5, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q5, q5, q3 \n" \ + "vadd.f32 q3, q6, q0 \n" \ + "vmul.f32 q6, q6, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q6, q6, q3 \n" \ + "vadd.f32 q3, q7, q0 \n" \ + "vmul.f32 q7, q7, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q7, q7, q3 \n" \ + "vadd.f32 q3, q8, q0 \n" \ + "vmul.f32 q8, q8, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q8, q8, q3 \n" \ + "vadd.f32 q3, q9, q0 \n" \ + "vmul.f32 q9, q9, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q9, q9, q3 \n" \ + "vadd.f32 q3, q10, q0 \n" \ + "vmul.f32 q10, q10, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q10, q10, q3 \n" \ + "vadd.f32 q3, q11, q0 \n" \ + "vmul.f32 q11, q11, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q11, q11, q3 \n" \ + "vadd.f32 q3, q12, q0 \n" \ + "vmul.f32 q12, q12, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q12, q12, q3 \n" \ + "vadd.f32 q3, q13, q0 \n" \ + "vmul.f32 q13, q13, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q13, q13, q3 \n" \ + "vadd.f32 q3, q14, q0 \n" \ + "vmul.f32 q14, q14, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q14, q14, q3 \n" \ + "vadd.f32 q3, q15, q0 \n" \ + "vmul.f32 q15, q15, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q15, q15, q3 \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W48_v7_SEMI2_OUT \ + SPARSE_INT8_F32_W48_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_FP32_W48_v7_SEMI2 \ + SPARSE_INT8_F32_W48_v7_SEMI2_RELU SPARSE_INT8_F32_W48_v7_SEMI2_RELU6 \ + SPARSE_INT8_F32_W48_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_F32_W48_v7_SEMI2_HARD_SWISH \ + "mov r0, %[c_ptr]\n" /* store result */ \ + "vst1.32 {d8-d11}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d12-d15}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d16-d19}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d20-d23}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d24-d27}, [r0]\n" \ + 
"add r0, r0, #32\n" \ + "vst1.32 {d28-d31}, [r0]\n" + +#define SPARSE_INT8_F32_W32_v7_SEMI2_KERNEL \ + "veor q8, q0, q0\n" \ + "veor q9, q1, q1\n" \ + "pld [%[a_ptr], #32] \n" \ + "veor q10, q2, q2\n" \ + "veor q11, q3, q3\n" \ + "pld [%[widx_dmap], #32] \n" \ + "veor q12, q4, q4\n" \ + "veor q13, q5, q5\n" \ + "pld [%[b_ptr], #64] \n" \ + "veor q14, q6, q6\n" \ + "veor q15, q7, q7\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldrsb r0, [%[a_ptr]]\n" \ + "subs %[k], %[k], #1\n" \ + "vld1.8 {d2-d5}, [%[b_ptr]]\n" \ + "add %[a_ptr], %[a_ptr], %[vdif]\n" \ + "vdup.8 d0, r0\n" \ + "vmull.s8 q3, d2, d0\n" \ + "vmull.s8 q4, d3, d0\n" \ + "pld [%[widx_dmap], #32] \n" \ + "vmull.s8 q5, d4, d0\n" \ + "vmull.s8 q6, d5, d0\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "vaddw.s16 q8, q8, d6\n" \ + "vaddw.s16 q9, q9, d7\n" \ + "pld [%[a_ptr], #32] \n" \ + "vaddw.s16 q10, q10, d8\n" \ + "vaddw.s16 q11, q11, d9\n" \ + "vaddw.s16 q12, q12, d10\n" \ + "pld [%[b_ptr], #64] \n" \ + "vaddw.s16 q13, q13, d11\n" \ + "vaddw.s16 q14, q14, d12\n" \ + "vaddw.s16 q15, q15, d13\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_INT8_TRANS_INT32_TO_FP32_W32_v7_SEMI2 \ + /* write output */ \ + "vcvt.f32.s32 q0, q8\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q1, q9\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q2, q10\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q11\n" /* cvt int32 to fp32*/ \ + "vdup.32 q8, %[vscale]\n" \ + "vdup.32 q4, %[vbias]\n" \ + "vdup.32 q5, d8[0]\n" \ + "vdup.32 q6, d8[0]\n" \ + "vdup.32 q7, d8[0]\n" \ + "3:\n" \ + "vmla.f32 q4, q0, q8\n" \ + "vmla.f32 q5, q1, q8\n" \ + "vmla.f32 q6, q2, q8\n" \ + "vmla.f32 q7, q3, q8\n" \ + "4:\n" \ + "vcvt.f32.s32 q8, q12\n" /* int32 to fp32*/ \ + "vcvt.f32.s32 q9, q13\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q10, q14\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q11, q15\n" /* cvt int32 to fp32*/ \ + "vdup.32 q12, %[vscale]\n" \ + "vdup.32 q0, %[vbias]\n" \ + "vdup.32 q1, d0[0]\n" \ + "vdup.32 q2, d0[0]\n" \ + "vdup.32 q3, d0[0]\n" \ + "6:\n" \ + "vmla.f32 q0, q8, q12\n" \ + "vmla.f32 q1, q9, q12\n" \ + "vmla.f32 q2, q10, q12\n" \ + "vmla.f32 q3, q11, q12\n" + +#define SPARSE_INT8_F32_W32_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q8, #0\n" /* for relu */ \ + "vmax.f32 q0, q0, q8\n" /* relu */ \ + "vmax.f32 q1, q1, q8\n" /* relu */ \ + "vmax.f32 q2, q2, q8\n" /* relu */ \ + "vmax.f32 q3, q3, q8\n" /* relu */ \ + "vmax.f32 q4, q4, q8\n" /* relu */ \ + "vmax.f32 q5, q5, q8\n" /* relu */ \ + "vmax.f32 q6, q6, q8\n" /* relu */ \ + "vmax.f32 q7, q7, q8\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W32_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q8, #0\n" /* for relu6 */ \ + "vdup.32 q9, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q0, q0, q8\n" /* relu6 */ \ + "vmax.f32 q1, q1, q8\n" /* relu6 */ \ + "vmax.f32 q2, q2, q8\n" /* relu6 */ \ + "vmax.f32 q3, q3, q8\n" /* relu6 */ \ + "vmax.f32 q4, q4, q8\n" /* relu6 */ \ + "vmax.f32 q5, q5, q8\n" /* relu6 */ \ + "vmax.f32 q6, q6, q8\n" /* relu6 */ \ + "vmax.f32 q7, q7, q8\n" /* relu6 */ \ + "vmin.f32 q0, q0, q9\n" /* relu6 */ \ + "vmin.f32 q1, q1, q9\n" /* relu6 */ \ + "vmin.f32 q2, q2, q9\n" /* relu6 */ \ + "vmin.f32 q3, q3, q9\n" /* relu6 */ \ + "vmin.f32 q4, q4, q9\n" /* relu6 */ \ + 
"vmin.f32 q5, q5, q9\n" /* relu6 */ \ + "vmin.f32 q6, q6, q9\n" /* relu6 */ \ + "vmin.f32 q7, q7, q9\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W32_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q8, #0\n" /* for relu */ \ + "vdup.32 q9, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q10, q0, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q11, q0, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q12, q1, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q13, q1, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q14, q2, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q15, q2, q9 \n" /* vmulq_f32 */ \ + "vbif q0, q11, q10 \n" \ + "vbif q1, q13, q12 \n" \ + "vbif q2, q15, q14 \n" \ + "vcge.f32 q10, q3, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q11, q3, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q12, q4, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q13, q4, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q14, q5, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q15, q5, q9 \n" /* vmulq_f32 */ \ + "vbif q3, q11, q10 \n" \ + "vbif q4, q13, q12 \n" \ + "vbif q5, q15, q14 \n" \ + "vcge.f32 q10, q6, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q11, q6, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q12, q7, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q13, q7, q9 \n" /* vmulq_f32 */ \ + "vbif q6, q11, q10 \n" \ + "vbif q7, q13, q12 \n" \ + "b 9f \n" + +#define SPARSE_INT8_F32_W32_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q8, #0 \n" \ + "vld1.f32 {d18-d21}, [%[hs_param]]! \n" \ + "vld1.f32 {d22-d23}, [%[hs_param]] \n" \ + "vadd.f32 q12, q0, q9 \n" \ + "vadd.f32 q14, q1, q9 \n" \ + "vmul.f32 q13, q0, q10 \n" \ + "vmul.f32 q14, q1, q10 \n" \ + "vmax.f32 q12, q12, q8 \n" \ + "vmax.f32 q14, q14, q8 \n" \ + "vmin.f32 q12, q12, q11 \n" \ + "vmin.f32 q14, q14, q11 \n" \ + "vmul.f32 q0, q13, q12 \n" \ + "vmul.f32 q1, q15, q14 \n" \ + "vadd.f32 q12, q2, q9 \n" \ + "vadd.f32 q14, q3, q9 \n" \ + "vmul.f32 q13, q2, q10 \n" \ + "vmul.f32 q14, q3, q10 \n" \ + "vmax.f32 q12, q12, q8 \n" \ + "vmax.f32 q14, q14, q8 \n" \ + "vmin.f32 q12, q12, q11 \n" \ + "vmin.f32 q14, q14, q11 \n" \ + "vmul.f32 q2, q13, q12 \n" \ + "vmul.f32 q3, q15, q14 \n" \ + "vadd.f32 q12, q4, q9 \n" \ + "vadd.f32 q14, q5, q9 \n" \ + "vmul.f32 q13, q4, q10 \n" \ + "vmul.f32 q14, q5, q10 \n" \ + "vmax.f32 q12, q12, q8 \n" \ + "vmax.f32 q14, q14, q8 \n" \ + "vmin.f32 q12, q12, q11 \n" \ + "vmin.f32 q14, q14, q11 \n" \ + "vmul.f32 q4, q13, q12 \n" \ + "vmul.f32 q5, q15, q14 \n" \ + "vadd.f32 q12, q6, q9 \n" \ + "vadd.f32 q14, q7, q9 \n" \ + "vmul.f32 q13, q6, q10 \n" \ + "vmul.f32 q14, q7, q10 \n" \ + "vmax.f32 q12, q12, q8 \n" \ + "vmax.f32 q14, q14, q8 \n" \ + "vmin.f32 q12, q12, q11 \n" \ + "vmin.f32 q14, q14, q11 \n" \ + "vmul.f32 q6, q13, q12 \n" \ + "vmul.f32 q7, q15, q14 \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W32_v7_SEMI2_OUT \ + SPARSE_INT8_F32_W32_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_FP32_W32_v7_SEMI2 \ + SPARSE_INT8_F32_W32_v7_SEMI2_RELU SPARSE_INT8_F32_W32_v7_SEMI2_RELU6 \ + SPARSE_INT8_F32_W32_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_F32_W32_v7_SEMI2_HARD_SWISH \ + "mov r0, %[c_ptr]\n" /* store result */ \ + "vst1.32 {d8-d11}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d12-d15}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d0-d3}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d4-d7}, [r0]\n" + +#define SPARSE_INT8_F32_W16_v7_SEMI2_KERNEL \ + "veor q4, q0, q0\n" \ + "pld [%[a_ptr], #32] \n" \ + "veor q5, q1, q1\n" \ + "pld [%[widx_dmap], #32] \n" \ + "veor q6, q2, q2\n" \ + "pld [%[b_ptr], #32] \n" \ + "veor 
q7, q3, q3\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldrsb r0, [%[a_ptr]]\n" \ + "subs %[k], %[k], #1\n" \ + "vld1.8 {d2-d3}, [%[b_ptr]]\n" \ + "add %[a_ptr], %[a_ptr], %[vdif]\n" \ + "vdup.8 d0, r0\n" \ + "vmull.s8 q2, d2, d0\n" \ + "vmull.s8 q3, d3, d0\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "vaddw.s16 q4, q4, d4\n" \ + "vaddw.s16 q5, q5, d5\n" \ + "pld [%[b_ptr], #32] \n" \ + "vaddw.s16 q6, q6, d6\n" \ + "vaddw.s16 q7, q7, d7\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_INT8_TRANS_INT32_TO_FP32_W16_v7_SEMI2 \ + /* write output */ \ + "vcvt.f32.s32 q0, q4\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q1, q5\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q2, q6\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q7\n" /* cvt int32 to fp32*/ \ + "vdup.32 q4, %[vscale]\n" \ + "vdup.32 q8, %[vbias]\n" \ + "vdup.32 q9, d16[0]\n" \ + "vdup.32 q10, d16[0]\n" \ + "vdup.32 q11, d16[0]\n" \ + "3:\n" \ + "vmla.f32 q8, q0, q4\n" \ + "vmla.f32 q9, q1, q4\n" \ + "vmla.f32 q10, q2, q4\n" \ + "vmla.f32 q11, q3, q4\n" + +#define SPARSE_INT8_F32_W16_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q8, q8, q0\n" /* relu */ \ + "vmax.f32 q9, q9, q0\n" /* relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W16_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q8, q8, q0\n" /* relu6 */ \ + "vmax.f32 q9, q9, q0\n" /* relu6 */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmin.f32 q8, q8, q1\n" /* relu6 */ \ + "vmin.f32 q9, q9, q1\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W16_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q8, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q8, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q9, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q9, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q6, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q7, q10, q1 \n" /* vmulq_f32 */ \ + "vbif q8, q3, q2 \n" \ + "vbif q9, q5, q4 \n" \ + "vbif q10, q7, q6 \n" \ + "vcge.f32 q2, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q11, q1 \n" /* vmulq_f32 */ \ + "vbif q11, q3, q2 \n" \ + "b 9f \n" + +#define SPARSE_INT8_F32_W16_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! 
\n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q8, q1 \n" \ + "vadd.f32 q6, q9, q1 \n" \ + "vmul.f32 q5, q8, q2 \n" \ + "vmul.f32 q7, q9, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q8, q5, q4 \n" \ + "vmul.f32 q9, q7, q6 \n" \ + "vadd.f32 q4, q10, q1 \n" \ + "vadd.f32 q6, q11, q1 \n" \ + "vmul.f32 q5, q10, q2 \n" \ + "vmul.f32 q7, q11, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q10, q5, q4 \n" \ + "vmul.f32 q11, q7, q6 \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W16_v7_SEMI2_OUT \ + SPARSE_INT8_F32_W16_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_FP32_W16_v7_SEMI2 \ + SPARSE_INT8_F32_W16_v7_SEMI2_RELU SPARSE_INT8_F32_W16_v7_SEMI2_RELU6 \ + SPARSE_INT8_F32_W16_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_F32_W16_v7_SEMI2_HARD_SWISH \ + "mov r0, %[c_ptr]\n" /* store result */ \ + "vst1.32 {d16-d19}, [r0]\n" \ + "add r0, r0, #32\n" \ + "vst1.32 {d20-d23}, [r0]\n" + +#define SPARSE_INT8_F32_W8_v7_SEMI2_KERNEL \ + "veor q4, q0, q0\n" \ + "veor q5, q1, q1\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldrsb r0, [%[a_ptr]]\n" \ + "subs %[k], %[k], #1\n" \ + "add %[a_ptr], %[a_ptr], %[vdif]\n" \ + "vld1.8 d2, [%[b_ptr]]\n" \ + "vdup.8 d0, r0\n" \ + "vmull.s8 q2, d2, d0\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "vaddw.s16 q4, q4, d4\n" \ + "vaddw.s16 q5, q5, d5\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_INT8_TRANS_INT32_TO_FP32_W8_v7_SEMI2 \ + /* write output */ \ + "vcvt.f32.s32 q0, q4\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q1, q5\n" /* cvt int32 to fp32*/ \ + "vdup.32 q2, %[vscale]\n" \ + "vdup.32 q6, %[vbias]\n" \ + "vdup.32 q7, d12[0]\n" \ + "3:\n" \ + "vmla.f32 q6, q0, q2\n" \ + "vmla.f32 q7, q1, q2\n" + +#define SPARSE_INT8_F32_W8_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q6, q6, q0\n" /* relu */ \ + "vmax.f32 q7, q7, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W8_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q6, q6, q0\n" /* relu6 */ \ + "vmax.f32 q7, q7, q0\n" /* relu6 */ \ + "vmin.f32 q6, q6, q1\n" /* relu6 */ \ + "vmin.f32 q7, q7, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W8_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q6, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q6, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q7, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q7, q1 \n" /* vmulq_f32 */ \ + "vbif q6, q3, q2 \n" /* vmulq_f32 */ \ + "vbif q7, q5, q4 \n" /* vmulq_f32 */ \ + "b 9f \n" + +#define SPARSE_INT8_F32_W8_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! 
\n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q6, q1 \n" \ + "vadd.f32 q8, q7, q1 \n" \ + "vmul.f32 q5, q6, q2 \n" \ + "vmul.f32 q9, q7, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q8, q8, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q8, q8, q3 \n" \ + "vmul.f32 q6, q5, q4 \n" \ + "vmul.f32 q7, q9, q8 \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W8_v7_SEMI2_OUT \ + SPARSE_INT8_F32_W8_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_FP32_W8_v7_SEMI2 \ + SPARSE_INT8_F32_W8_v7_SEMI2_RELU SPARSE_INT8_F32_W8_v7_SEMI2_RELU6 \ + SPARSE_INT8_F32_W8_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_F32_W8_v7_SEMI2_HARD_SWISH /* store result */ \ + "vst1.32 {d12-d15}, [%[c_ptr]]\n" + +#define SPARSE_INT8_F32_W4_v7_SEMI2_KERNEL \ + "veor q3, q0, q0\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldrsb r0, [%[a_ptr]]\n" \ + "vdup.8 d0, r0\n" \ + "subs %[k], %[k], #1\n" \ + "ldr r0, [%[b_ptr]]\n" \ + "add %[a_ptr], %[a_ptr], %[vdif]\n" \ + "vdup.32 d2, r0\n" \ + "vmull.s8 q2, d2, d0\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "vaddw.s16 q3, q3, d4\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_INT8_TRANS_INT32_TO_FP32_W4_v7_SEMI2 \ + /* write output */ \ + "vcvt.f32.s32 q0, q3\n" /* cvt int32 to fp32*/ \ + "vdup.32 q1, %[vscale]\n" \ + "vdup.32 q4, %[vbias]\n" \ + "3:\n" \ + "vmla.f32 q4, q0, q1\n" + +#define SPARSE_INT8_F32_W4_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q4, q4, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W4_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q4, q4, q0\n" /* relu6 */ \ + "vmin.f32 q4, q4, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_F32_W4_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q4, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q4, q1 \n" /* vmulq_f32 */ \ + "vbif q4, q3, q2 \n" \ + "b 9f \n" + +#define SPARSE_INT8_F32_W4_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! 
\n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q5, q4, q1 \n" \ + "vmul.f32 q6, q4, q2 \n" \ + "vmax.f32 q5, q5, q0 \n" \ + "vmin.f32 q5, q5, q3 \n" \ + "vmul.f32 q4, q6, q5 \n" \ + "9:\n" + +#define SPARSE_INT8_F32_W4_v7_SEMI2_OUT \ + SPARSE_INT8_F32_W4_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_FP32_W4_v7_SEMI2 \ + SPARSE_INT8_F32_W4_v7_SEMI2_RELU SPARSE_INT8_F32_W4_v7_SEMI2_RELU6 \ + SPARSE_INT8_F32_W4_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_F32_W4_v7_SEMI2_HARD_SWISH /* store result */ \ + "vst1.32 {d8-d9}, [%[c_ptr]]\n" + +#define GET_SEMI_INT8_V7_PARAM_TABLE(Dtype_i, Dtype_o) \ + Dtype_o* out_ptr1 = \ + reinterpret_cast((uintptr_t)output + output_stride * 2 * i); \ + Dtype_o* out_ptr2 = reinterpret_cast((uintptr_t)output + \ + output_stride * (2 * i + 1)); \ + const Dtype_i* cur_w = A; \ + uint32_t nnz = nidx_nnzmap[i]; \ + const Dtype_i* cur_b = B; \ + const int32_t* dmap = widx_dmap; \ + if (i != 0) { \ + cur_w = A + nidx_nnzmap[i - 1] * 2; \ + nnz = nidx_nnzmap[i] - nidx_nnzmap[i - 1]; \ + cur_b += ((nidx_nnzmap[i - 1] == 0) \ + ? 0 \ + : widx_dmap[nidx_nnzmap[i - 1] - 1] / sizeof(Dtype_i)); \ + dmap = widx_dmap + nidx_nnzmap[i - 1]; \ + } \ + const Dtype_i* cur_w2 = cur_w + 1; \ + const Dtype_i* cur_b2 = cur_b; \ + const int32_t* dmap2 = dmap; \ + uint32_t nnz2 = nnz; + +#define SET_ASSM_INPUT_PARAM1_1_V7_INT8_F32 \ +: [a_ptr] "+r"(cur_w), \ + [b_ptr] "+r"(cur_b), \ + [c_ptr] "+r"(out_ptr1), \ + [k] "+r"(nnz), \ + [widx_dmap] "+r"(dmap), \ + [hs_param] "+r"(hs_param) \ +: [vscale] "r"(vscale), \ + [vbias] "r"(vbias), \ + [vflag_act] "r"(flag_act), \ + [valpha] "r"(alpha), \ + [vdif] "r"(2) + +#define SET_ASSM_INPUT_PARAM1_2_V7_INT8_F32 \ +: [a_ptr] "+r"(cur_w2), \ + [b_ptr] "+r"(cur_b2), \ + [c_ptr] "+r"(out_ptr2), \ + [k] "+r"(nnz2), \ + [widx_dmap] "+r"(dmap2), \ + [hs_param] "+r"(hs_param) \ +: [vscale] "r"(vscale), \ + [vbias] "r"(vbias), \ + [vflag_act] "r"(flag_act), \ + [valpha] "r"(alpha), \ + [vdif] "r"(2) + +#define SET_ASSM_INPUT_PARAM2_V7_INT8_F32 \ +: [a_ptr] "+r"(cur_w), \ + [b_ptr] "+r"(cur_b), \ + [c_ptr] "+r"(out_ptr), \ + [k] "+r"(nnz), \ + [widx_dmap] "+r"(dmap), \ + [hs_param] "+r"(hs_param) \ +: [vscale] "r"(vscale), \ + [vbias] "r"(vbias), \ + [vflag_act] "r"(flag_act), \ + [valpha] "r"(alpha), \ + [vdif] "r"(2) + +/** + * \brief Sparse calculation implementation of 1x1 convolution, the input-output + * type is int8-f32. 
+ * Sparse matrix multiplication is calculated in blocks; the main block size
+ * is Mx48, i.e. the weight matrix is MxK and the activation block is Kx48.
+ * When fewer than 48 columns remain, blocks of Mx32, Mx16, Mx8 and Mx4 are
+ * used in turn.
+ * @param A sparse weight data
+ * @param B dense input data
+ * @param widx_dmap an array of int32_t values storing the scaled [by
+ * sizeof(input element)] differences between the input columns
+ * corresponding to successive non-zero weight elements
+ * @param nidx_nnzmap the number of non-zero kernel elements per each output
+ * channel
+ * @param bias bias data, may be nullptr
+ * @param scale per-output-channel dequantization scale
+ * @param output output data
+ * @param M number of output channels
+ * @param K number of input channels
+ * @param N output spatial size
+ * @param param sparse conv param
+ * @param ctx ARM context
+ */
+void sparse_semi_conv_int8_fp32_pipelined(
+    const int8_t* A,
+    const int8_t* B,
+    const int32_t* widx_dmap,
+    const uint32_t* nidx_nnzmap,
+    const float* bias,
+    const float* scale,
+    float* output,
+    int M,
+    int K,
+    int N,
+    const operators::SparseConvParam& param,
+    ARMContext* ctx) {
+  auto act_param = param.activation_param;
+  auto act_type = act_param.active_type;
+  volatile float alpha = 0.f;
+  float vhs_param[12] = {0.f};
+  int flag_act = 0x00;  // relu: 1, relu6: 2, leaky: 3, hard_swish: 4
+  if (act_param.has_active) {
+    if (act_type == lite_api::ActivationType::kRelu) {
+      flag_act = 0x01;
+    } else if (act_type == lite_api::ActivationType::kRelu6) {
+      flag_act = 0x02;
+      alpha = act_param.Relu_clipped_coef;
+    } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+      flag_act = 0x03;
+      alpha = act_param.Leaky_relu_alpha;
+    } else if (act_type == lite_api::ActivationType::kHardSwish) {
+      flag_act = 0x04;
+      for (int i = 0; i < 4; i++) {
+        vhs_param[i] = act_param.hard_swish_offset;
+        vhs_param[i + 4] = 1.0 / act_param.hard_swish_scale;
+        vhs_param[i + 8] = act_param.hard_swish_threshold;
+      }
+    }
+  }
+  int flag_bias = (bias != nullptr) ? 1 : 0;
+  size_t mc = N * sizeof(int8_t);
+  size_t nc = M;
+  size_t output_stride = N * sizeof(float);
+  size_t output_decrement = output_stride * nc - 48 * sizeof(float);
+  size_t pair_num = M / 2;
+  size_t lave_num = M % 2;
+  float bias_zero[2] = {0.f, 0.f};
+  while
+    SPARSE_LIKELY(mc >= 48 * sizeof(int8_t)) {
+      LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+        GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, float)
+        float vbias = (bias != nullptr) ? bias[i * 2] : 0.0;
+        float vscale = scale[i * 2];
+        float* hs_param = vhs_param;
+        // clang-format off
+        asm volatile(SPARSE_INT8_F32_W48_v7_SEMI2_OUT
+          SET_ASSM_INPUT_PARAM1_1_V7_INT8_F32
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
+            "q13", "q14", "q15", "r0", "r1", "cc", "memory");
+        // clang-format on
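+        // Second row of the pair: the first pass advanced cur_w/cur_b/dmap
+        // (they are "+r" operands), so this pass replays the same B columns
+        // through the cur_w2/cur_b2/dmap2 copies taken by
+        // GET_SEMI_INT8_V7_PARAM_TABLE, with the odd row's bias and scale.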
+        vbias = (bias != nullptr) ? bias[i * 2 + 1] : 0.0;
+        vscale = scale[i * 2 + 1];
+        hs_param = vhs_param;
+        // clang-format off
+        asm volatile(SPARSE_INT8_F32_W48_v7_SEMI2_OUT
+          SET_ASSM_INPUT_PARAM1_2_V7_INT8_F32
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
+            "q13", "q14", "q15", "r0", "r1", "cc", "memory");
+        // clang-format on
+      }
+      LITE_PARALLEL_COMMON_END();
+      if
+        SPARSE_UNLIKELY(lave_num != 0) {
+          GET_UNSTRUCT_PARAM_TABLE(int8_t, float)
+          float vscale = scale[pair_num * 2];
+          float* hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_INT8_F32_W48_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM2_V7_INT8_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
+              "q13", "q14", "q15", "r0", "r1", "cc", "memory");
+          // clang-format on
+        }
+      output =
+          reinterpret_cast<float*>((uintptr_t)output + 48 * sizeof(float));
+      B += 48;
+      mc -= 48 * sizeof(int8_t);
+    }
+  if
+    SPARSE_UNLIKELY(mc != 0) {
+      output_decrement += 16 * sizeof(float);
+      if (mc & (32 * sizeof(int8_t))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, float)
+          float vbias = (bias != nullptr) ? bias[i * 2] : 0.0;
+          float vscale = scale[i * 2];
+          float* hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_INT8_F32_W32_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_1_V7_INT8_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
+              "q13", "q14", "q15", "r0", "r1", "cc", "memory");
+          // clang-format on
+          vbias = (bias != nullptr) ? bias[i * 2 + 1] : 0.0;
+          vscale = scale[i * 2 + 1];
+          hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_INT8_F32_W32_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_2_V7_INT8_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
+              "q13", "q14", "q15", "r0", "r1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(int8_t, float)
+            float vscale = scale[pair_num * 2];
+            float* hs_param = vhs_param;
+            // clang-format off
+            asm volatile(SPARSE_INT8_F32_W32_v7_SEMI2_OUT
+              SET_ASSM_INPUT_PARAM2_V7_INT8_F32
+              : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
+                "q13", "q14", "q15", "r0", "r1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 32 * sizeof(float));
+        B += 32;
+        mc -= 32 * sizeof(int8_t);
+      }
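+      // Each narrower tail below follows the same pattern: accumulate in
+      // int32, dequantize as out = vbias + vscale * (float)acc, then apply
+      // the selected activation before storing.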
+      output_decrement += 16 * sizeof(float);
+      if (mc & (16 * sizeof(int8_t))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, float)
+          float vbias = (bias != nullptr) ? bias[i * 2] : 0.0;
+          float vscale = scale[i * 2];
+          float* hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_INT8_F32_W16_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_1_V7_INT8_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+              "r0", "r1", "cc", "memory");
+          // clang-format on
+          vbias = (bias != nullptr) ? bias[i * 2 + 1] : 0.0;
+          vscale = scale[i * 2 + 1];
+          hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_INT8_F32_W16_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_2_V7_INT8_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+              "r0", "r1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(int8_t, float)
+            float vscale = scale[pair_num * 2];
+            float* hs_param = vhs_param;
+            // clang-format off
+            asm volatile(SPARSE_INT8_F32_W16_v7_SEMI2_OUT
+              SET_ASSM_INPUT_PARAM2_V7_INT8_F32
+              : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+                "r0", "r1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 16 * sizeof(float));
+        B += 16;
+        mc -= 16 * sizeof(int8_t);
+      }
+      output_decrement += 8 * sizeof(float);
+      if (mc & (8 * sizeof(int8_t))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, float)
+          float vbias = (bias != nullptr) ? bias[i * 2] : 0.0;
+          float vscale = scale[i * 2];
+          float* hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_INT8_F32_W8_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_1_V7_INT8_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "r0", "r1", "cc", "memory");
+          // clang-format on
+          vbias = (bias != nullptr) ? bias[i * 2 + 1] : 0.0;
+          vscale = scale[i * 2 + 1];
+          hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_INT8_F32_W8_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_2_V7_INT8_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "r0", "r1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(int8_t, float)
+            float vscale = scale[pair_num * 2];
+            float* hs_param = vhs_param;
+            // clang-format off
+            asm volatile(SPARSE_INT8_F32_W8_v7_SEMI2_OUT
+              SET_ASSM_INPUT_PARAM2_V7_INT8_F32
+              : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "r0", "r1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 8 * sizeof(float));
+        B += 8;
+        mc -= 8 * sizeof(int8_t);
+      }
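+      // mc is a byte count tested bit by bit, so a tail is covered by a
+      // combination of paths, e.g. 28 remaining columns take the 16-, 8- and
+      // 4-wide branches in turn (28 = 16 + 8 + 4).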
+      output_decrement += 4 * sizeof(float);
+      if (mc & (4 * sizeof(int8_t))) {
+        LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+          GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, float)
+          float vbias = (bias != nullptr) ? bias[i * 2] : 0.0;
+          float vscale = scale[i * 2];
+          float* hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_INT8_F32_W4_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_1_V7_INT8_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "r0", "r1", "cc", "memory");
+          // clang-format on
+          vbias = (bias != nullptr) ? bias[i * 2 + 1] : 0.0;
+          vscale = scale[i * 2 + 1];
+          hs_param = vhs_param;
+          // clang-format off
+          asm volatile(SPARSE_INT8_F32_W4_v7_SEMI2_OUT
+            SET_ASSM_INPUT_PARAM1_2_V7_INT8_F32
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "r0", "r1", "cc", "memory");
+          // clang-format on
+        }
+        LITE_PARALLEL_COMMON_END();
+        if
+          SPARSE_UNLIKELY(lave_num != 0) {
+            GET_UNSTRUCT_PARAM_TABLE(int8_t, float)
+            float vscale = scale[pair_num * 2];
+            float* hs_param = vhs_param;
+            // clang-format off
+            asm volatile(SPARSE_INT8_F32_W4_v7_SEMI2_OUT
+              SET_ASSM_INPUT_PARAM2_V7_INT8_F32
+              : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "r0", "r1", "cc", "memory");
+            // clang-format on
+          }
+        output =
+            reinterpret_cast<float*>((uintptr_t)output + 4 * sizeof(float));
+        B += 4;
+        mc -= 4 * sizeof(int8_t);
+      }
+
+      if
+        SPARSE_UNLIKELY(mc != 0 && mc < 4 * sizeof(int8_t)) {
+          int mindex = mc / sizeof(int8_t);
+
+          LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) {
+            float* out_ptr1 = reinterpret_cast<float*>(
+                (uintptr_t)output + output_stride * 2 * i);
+            float* out_ptr2 = reinterpret_cast<float*>(
+                (uintptr_t)output + output_stride * (2 * i + 1));
+            const int8_t* cur_w = A;
+            uint32_t nnz = nidx_nnzmap[i];
+            const int8_t* cur_b = B;
+            const int32_t* dmap = widx_dmap;
+            if (i != 0) {
+              cur_w = A + nidx_nnzmap[i - 1] * 2;
+              nnz = nidx_nnzmap[i] - nidx_nnzmap[i - 1];
+              cur_b +=
+                  ((nidx_nnzmap[i - 1] == 0)
+                       ? 0
+                       : widx_dmap[nidx_nnzmap[i - 1] - 1] / sizeof(int8_t));
+              dmap = widx_dmap + nidx_nnzmap[i - 1];
+            }
+            float32x4_t vbias1, vbias2;
+            if (bias != nullptr) {
+              vbias1 = vdupq_n_f32(bias[i * 2]);
+              vbias2 = vdupq_n_f32(bias[i * 2 + 1]);
+            } else {
+              vbias1 = vdupq_n_f32(0);
+              vbias2 = vdupq_n_f32(0);
+            }
+            const float* pscale = scale + i * 2;
+            int32x4_t vacc0123_1 = vdupq_n_s32(0);
+            int32x4_t vacc0123_2 = vdupq_n_s32(0);
+            for (int j = 0; j < nnz; j++) {
+              int8x8_t vi0123 = vdup_n_s8(0);
+              if (mc == 1) {
+                vi0123 = vld1_lane_s8(cur_b, vi0123, 0);
+              } else if (mc == 2) {
+                vi0123 = vld1_lane_s8(cur_b, vi0123, 0);
+                vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1);
+              } else {
+                vi0123 = vld1_lane_s8(cur_b, vi0123, 0);
+                vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1);
+                vi0123 = vld1_lane_s8(cur_b + 2, vi0123, 2);
+              }
+              int8x8_t vw1 = vld1_dup_s8(cur_w);
+              cur_w += 1;
+              int8x8_t vw2 = vld1_dup_s8(cur_w);
+              cur_w += 1;
+              intptr_t diff = *dmap++;
+              cur_b = (const int8_t*)((uintptr_t)cur_b + (uintptr_t)diff);
+              vacc0123_1 =
+                  vaddw_s16(vacc0123_1, vget_low_s16(vmull_s8(vi0123, vw1)));
+              vacc0123_2 =
+                  vaddw_s16(vacc0123_2, vget_low_s16(vmull_s8(vi0123, vw2)));
+            }
+            float32x4_t vaccf0123_1 = vcvtq_f32_s32(vacc0123_1);
+            float32x4_t vaccf0123_2 = vcvtq_f32_s32(vacc0123_2);
+            vaccf0123_1 = vmlaq_n_f32(vbias1, vaccf0123_1, pscale[0]);
+            vaccf0123_2 = vmlaq_n_f32(vbias2, vaccf0123_2, pscale[1]);
+            float32x4_t vzero = vdupq_n_f32(0);
+            if (flag_act == 1) {
+              vaccf0123_1 = vmaxq_f32(vaccf0123_1, vzero);
+              vaccf0123_2 = vmaxq_f32(vaccf0123_2, vzero);
+            } else if (flag_act == 0) {
+            } else if (flag_act == 2) {
+              float32x4_t aph = vdupq_n_f32(alpha);
+              vaccf0123_1 = vmaxq_f32(vaccf0123_1, vzero);
+              vaccf0123_1 = vminq_f32(vaccf0123_1, aph);
+              vaccf0123_2 = vmaxq_f32(vaccf0123_2, vzero);
+              vaccf0123_2 = vminq_f32(vaccf0123_2, aph);
+            } else if (flag_act == 3) {
+              float32x4_t aph = vdupq_n_f32(alpha);
+              uint32x4_t vflag0123_1 = vcgeq_f32(vaccf0123_1, vzero);
+              float32x4_t v0123_1 = vmulq_f32(vaccf0123_1, aph);
+              vaccf0123_1 = vbslq_f32(vflag0123_1, vaccf0123_1, v0123_1);
+              uint32x4_t vflag0123_2 = vcgeq_f32(vaccf0123_2, vzero);
+              float32x4_t v0123_2 =
+                  vmulq_f32(vaccf0123_2, aph);
+              vaccf0123_2 = vbslq_f32(vflag0123_2, vaccf0123_2, v0123_2);
+            } else {
+              float32x4_t offset = vdupq_n_f32(act_param.hard_swish_offset);
+              float32x4_t hs_scale =
+                  vdupq_n_f32(1.0 / act_param.hard_swish_scale);
+              float32x4_t thre = vdupq_n_f32(act_param.hard_swish_threshold);
+              float32x4_t vset0123_1 = vaddq_f32(vaccf0123_1, offset);
+              float32x4_t vscale0123_1 = vmulq_f32(vaccf0123_1, hs_scale);
+              vset0123_1 = vmaxq_f32(vset0123_1, vzero);
+              vset0123_1 = vminq_f32(vset0123_1, thre);
+              vaccf0123_1 = vmulq_f32(vscale0123_1, vset0123_1);
+              float32x4_t vset0123_2 = vaddq_f32(vaccf0123_2, offset);
+              float32x4_t vscale0123_2 = vmulq_f32(vaccf0123_2, hs_scale);
+              vset0123_2 = vmaxq_f32(vset0123_2, vzero);
+              vset0123_2 = vminq_f32(vset0123_2, thre);
+              vaccf0123_2 = vmulq_f32(vscale0123_2, vset0123_2);
+            }
+
+            if (mc == 1) {
+              vst1q_lane_f32(out_ptr1, vaccf0123_1, 0);
+              vst1q_lane_f32(out_ptr2, vaccf0123_2, 0);
+            } else if (mc == 2) {
+              vst1q_lane_f32(out_ptr1, vaccf0123_1, 0);
+              vst1q_lane_f32(out_ptr2, vaccf0123_2, 0);
+              vst1q_lane_f32(out_ptr1 + 1, vaccf0123_1, 1);
+              vst1q_lane_f32(out_ptr2 + 1, vaccf0123_2, 1);
+            } else {
+              vst1q_lane_f32(out_ptr1, vaccf0123_1, 0);
+              vst1q_lane_f32(out_ptr2, vaccf0123_2, 0);
+              vst1q_lane_f32(out_ptr1 + 1, vaccf0123_1, 1);
+              vst1q_lane_f32(out_ptr2 + 1, vaccf0123_2, 1);
+              vst1q_lane_f32(out_ptr1 + 2, vaccf0123_1, 2);
+              vst1q_lane_f32(out_ptr2 + 2, vaccf0123_2, 2);
+            }
+          }
+          LITE_PARALLEL_COMMON_END();
+          if
+            SPARSE_UNLIKELY(lave_num != 0) {
+              float* out_ptr = reinterpret_cast<float*>(
+                  (uintptr_t)output + output_stride * 2 * pair_num);
+              const int8_t* cur_w = A;
+              uint32_t nnz = nidx_nnzmap[pair_num];
+              const int8_t* cur_b = B;
+              const int32_t* dmap = widx_dmap;
+              if (pair_num != 0) {
+                cur_w = A + nidx_nnzmap[pair_num - 1] * 2;
+                nnz = nidx_nnzmap[pair_num] - nidx_nnzmap[pair_num - 1];
+                cur_b += ((nidx_nnzmap[pair_num - 1] == 0)
+                              ? 0
+                              : widx_dmap[nidx_nnzmap[pair_num - 1] - 1] /
+                                    sizeof(int8_t));
+                dmap = widx_dmap + nidx_nnzmap[pair_num - 1];
+              }
+              float32x4_t vbias = (bias != nullptr)
+                                      ?
vdupq_n_f32(bias[pair_num * 2]) + : vdupq_n_f32(0); + float vscale = scale[pair_num * 2]; + int32x4_t vacc0123 = vdupq_n_s32(0); + + for (int j = 0; j < nnz; j++) { + int8x8_t vi0123 = vdup_n_s8(0); + if (mc == 1) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + } else if (mc == 2) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + } else { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + vi0123 = vld1_lane_s8(cur_b + 2, vi0123, 2); + } + int8x8_t vw = vld1_dup_s8(cur_w); + cur_w += 1; + intptr_t diff = *dmap++; + cur_b = (const int8_t*)((uintptr_t)cur_b + (uintptr_t)diff); + int16x8_t vo0123 = vmull_s8(vi0123, vw); + vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vo0123)); + } + float32x4_t vaccf0123 = vcvtq_f32_s32(vacc0123); + vaccf0123 = vmlaq_n_f32(vbias, vaccf0123, vscale); + float32x4_t vzero = vdupq_n_f32(0); + if (flag_act == 1) { + vaccf0123 = vmaxq_f32(vaccf0123, vzero); + } else if (flag_act == 0) { + } else if (flag_act == 2) { + float32x4_t aph = vdupq_n_f32(alpha); + vaccf0123 = vmaxq_f32(vaccf0123, vzero); + vaccf0123 = vminq_f32(vaccf0123, aph); + } else if (flag_act == 3) { + float32x4_t aph = vdupq_n_f32(alpha); + uint32x4_t vflag0123 = vcgeq_f32(vaccf0123, vzero); + float32x4_t v0123 = vmulq_f32(vaccf0123, aph); + vaccf0123 = vbslq_f32(vflag0123, vaccf0123, v0123); + } else { + float32x4_t offset = vdupq_n_f32(act_param.hard_swish_offset); + float32x4_t hs_scale = + vdupq_n_f32(1.0 / act_param.hard_swish_scale); + float32x4_t thre = vdupq_n_f32(act_param.hard_swish_threshold); + float32x4_t vset0123 = vaddq_f32(vaccf0123, offset); + float32x4_t vscale0123 = vmulq_f32(vaccf0123, hs_scale); + vset0123 = vmaxq_f32(vset0123, vzero); + vset0123 = vminq_f32(vset0123, thre); + vaccf0123 = vmulq_f32(vscale0123, vset0123); + } + + if (mc == 1) { + vst1q_lane_f32(out_ptr, vaccf0123, 0); + } else if (mc == 2) { + vst1q_lane_f32(out_ptr, vaccf0123, 0); + vst1q_lane_f32(out_ptr + 1, vaccf0123, 1); + } else { + vst1q_lane_f32(out_ptr, vaccf0123, 0); + vst1q_lane_f32(out_ptr + 1, vaccf0123, 1); + vst1q_lane_f32(out_ptr + 2, vaccf0123, 2); + } + } + } + } +} + +#define SPARSE_INT8_INT8_W48_v7_SEMI2_KERNEL \ + "veor q4, q0, q0\n" \ + "veor q5, q1, q1\n" \ + "veor q6, q2, q2\n" \ + "veor q7, q3, q3\n" \ + "pld [%[a_ptr], #32] \n" \ + "veor q8, q0, q0\n" \ + "veor q9, q1, q1\n" \ + "pld [%[widx_dmap], #32] \n" \ + "veor q10, q2, q2\n" \ + "veor q11, q3, q3\n" \ + "veor q12, q0, q0\n" \ + "pld [%[b_ptr], #64] \n" \ + "veor q13, q1, q1\n" \ + "veor q14, q2, q2\n" \ + "veor q15, q3, q3\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldrsb r0, [%[a_ptr]]\n" \ + "subs %[k], %[k], #1\n" \ + "mov r1, %[b_ptr]\n" \ + "add %[a_ptr], %[a_ptr], %[vdif]\n" \ + "vld1.8 {d2-d5}, [r1]\n" \ + "vdup.8 d0, r0\n" \ + "vmull.s8 q3, d2, d0\n" \ + "vaddw.s16 q4, q4, d6\n" \ + "vaddw.s16 q5, q5, d7\n" \ + "vmull.s8 q3, d3, d0\n" \ + "pld [%[widx_dmap], #32] \n" \ + "vaddw.s16 q6, q6, d6\n" \ + "vaddw.s16 q7, q7, d7\n" \ + "vmull.s8 q3, d4, d0\n" \ + "vaddw.s16 q8, q8, d6\n" \ + "vaddw.s16 q9, q9, d7\n" \ + "vmull.s8 q3, d5, d0\n" \ + "ldr r0, [%[widx_dmap]], #4\n" \ + "add %[b_ptr], %[b_ptr], r0\n" \ + "vaddw.s16 q10, q10, d6\n" \ + "vaddw.s16 q11, q11, d7\n" \ + "pld [%[b_ptr], #64] \n" \ + "add r1, r1, #32\n" \ + "vld1.8 {d2-d3}, [r1]\n" \ + "vmull.s8 q3, d2, d0\n" \ + "vaddw.s16 q12, q12, d6\n" \ + "pld [%[a_ptr], #32] \n" \ + "vaddw.s16 q13, q13, d7\n" \ + "vmull.s8 q3, d3, d0\n" \ + "vaddw.s16 q14, q14, 
d6\n" \ + "vaddw.s16 q15, q15, d7\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_INT8_TRANS_INT32_TO_INT8_W48_v7_SEMI2 \ + /* write output */ \ + "vld1.32 {d2-d3}, [%[bias_scale]] \n" \ + "add %[bias_scale], #16 \n" \ + "vld1.32 {d0-d1}, [%[bias_scale]] \n" \ + "vcvt.f32.s32 q2, q4\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q5\n" /* cvt int32 to fp32*/ \ + "vdup.32 q4, d2[0]\n" \ + "vdup.32 q5, d2[0]\n" \ + "vmla.f32 q4, q2, q0\n" \ + "vmla.f32 q5, q3, q0\n" \ + "vcvt.f32.s32 q2, q6\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q7\n" /* cvt int32 to fp32*/ \ + "vdup.32 q6, d2[0]\n" \ + "vdup.32 q7, d2[0]\n" \ + "vmla.f32 q6, q2, q0\n" \ + "vmla.f32 q7, q3, q0\n" \ + "vcvt.f32.s32 q2, q8\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q9\n" /* cvt int32 to fp32*/ \ + "vdup.32 q8, d2[0]\n" \ + "vdup.32 q9, d2[0]\n" \ + "vmla.f32 q8, q2, q0\n" \ + "vmla.f32 q9, q3, q0\n" \ + "vcvt.f32.s32 q2, q10\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q11\n" /* cvt int32 to fp32*/ \ + "vdup.32 q10, d2[0]\n" \ + "vdup.32 q11, d2[0]\n" \ + "vmla.f32 q10, q2, q0\n" \ + "vmla.f32 q11, q3, q0\n" \ + "vcvt.f32.s32 q2, q12\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q13\n" /* cvt int32 to fp32*/ \ + "vdup.32 q12, d2[0]\n" \ + "vdup.32 q13, d2[0]\n" \ + "vmla.f32 q12, q2, q0\n" \ + "vmla.f32 q13, q3, q0\n" \ + "vcvt.f32.s32 q2, q14\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q15\n" /* cvt int32 to fp32*/ \ + "vdup.32 q14, d2[0]\n" \ + "vdup.32 q15, d2[0]\n" \ + "vmla.f32 q14, q2, q0\n" \ + "vmla.f32 q15, q3, q0\n" + +#define SPARSE_INT8_INT8_W48_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q4, q4, q0\n" /* relu */ \ + "vmax.f32 q5, q5, q0\n" /* relu */ \ + "vmax.f32 q6, q6, q0\n" /* relu */ \ + "vmax.f32 q7, q7, q0\n" /* relu */ \ + "vmax.f32 q8, q8, q0\n" /* relu */ \ + "vmax.f32 q9, q9, q0\n" /* relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "vmax.f32 q12, q12, q0\n" /* relu */ \ + "vmax.f32 q13, q13, q0\n" /* relu */ \ + "vmax.f32 q14, q14, q0\n" /* relu */ \ + "vmax.f32 q15, q15, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W48_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q4, q4, q0\n" /* relu6 */ \ + "vmax.f32 q5, q5, q0\n" /* relu6 */ \ + "vmax.f32 q6, q6, q0\n" /* relu6 */ \ + "vmax.f32 q7, q7, q0\n" /* relu6 */ \ + "vmax.f32 q8, q8, q0\n" /* relu6 */ \ + "vmax.f32 q9, q9, q0\n" /* relu6 */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmax.f32 q12, q12, q0\n" /* relu6 */ \ + "vmax.f32 q13, q13, q0\n" /* relu6 */ \ + "vmax.f32 q14, q14, q0\n" /* relu6 */ \ + "vmax.f32 q15, q15, q0\n" /* relu6 */ \ + "vmin.f32 q4, q4, q1\n" /* relu6 */ \ + "vmin.f32 q5, q5, q1\n" /* relu6 */ \ + "vmin.f32 q6, q6, q1\n" /* relu6 */ \ + "vmin.f32 q7, q7, q1\n" /* relu6 */ \ + "vmin.f32 q8, q8, q1\n" /* relu6 */ \ + "vmin.f32 q9, q9, q1\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "vmin.f32 q12, q12, q1\n" /* relu6 */ \ + "vmin.f32 q13, q13, q1\n" /* relu6 */ \ + "vmin.f32 q14, q14, q1\n" /* relu6 */ \ + "vmin.f32 q15, q15, q1\n" /* relu6 */ \ + "b 9f \n" /* 
relu end */ + +#define SPARSE_INT8_INT8_W48_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q4, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q4, q1 \n" /* vmulq_f32 */ \ + "vbif q4, q3, q2 \n" \ + "vcge.f32 q2, q5, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q5, q1 \n" /* vmulq_f32 */ \ + "vbif q5, q3, q2 \n" \ + "vcge.f32 q2, q6, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q6, q1 \n" /* vmulq_f32 */ \ + "vbif q6, q3, q2 \n" \ + "vcge.f32 q2, q7, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q7, q1 \n" /* vmulq_f32 */ \ + "vbif q7, q3, q2 \n" \ + "vcge.f32 q2, q8, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q8, q1 \n" /* vmulq_f32 */ \ + "vbif q8, q3, q2 \n" \ + "vcge.f32 q2, q9, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q9, q1 \n" /* vmulq_f32 */ \ + "vbif q9, q3, q2 \n" \ + "vcge.f32 q2, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q10, q1 \n" /* vmulq_f32 */ \ + "vbif q10, q3, q2 \n" \ + "vcge.f32 q2, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q11, q1 \n" /* vmulq_f32 */ \ + "vbif q11, q3, q2 \n" \ + "vcge.f32 q2, q12, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q12, q1 \n" /* vmulq_f32 */ \ + "vbif q12, q3, q2 \n" \ + "vcge.f32 q2, q13, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q13, q1 \n" /* vmulq_f32 */ \ + "vbif q13, q3, q2 \n" \ + "vcge.f32 q2, q14, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q14, q1 \n" /* vmulq_f32 */ \ + "vbif q14, q3, q2 \n" \ + "vcge.f32 q2, q15, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q15, q1 \n" /* vmulq_f32 */ \ + "vbif q15, q3, q2 \n" \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W48_v7_SEMI2_HARD_SWISH \ + /* do relu */ \ + "12: \n" \ + "vld1.f32 {d0-d3}, [%[hs_param]]! 
@ load hard swish alpha\n" \ + "vadd.f32 q3, q4, q0 \n" \ + "vmul.f32 q4, q4, q1 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q4, q4, q3 \n" \ + "vadd.f32 q3, q5, q0 \n" \ + "vmul.f32 q5, q5, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q5, q5, q3 \n" \ + "vadd.f32 q3, q6, q0 \n" \ + "vmul.f32 q6, q6, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q6, q6, q3 \n" \ + "vadd.f32 q3, q7, q0 \n" \ + "vmul.f32 q7, q7, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q7, q7, q3 \n" \ + "vadd.f32 q3, q8, q0 \n" \ + "vmul.f32 q8, q8, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q8, q8, q3 \n" \ + "vadd.f32 q3, q9, q0 \n" \ + "vmul.f32 q9, q9, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q9, q9, q3 \n" \ + "vadd.f32 q3, q10, q0 \n" \ + "vmul.f32 q10, q10, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q10, q10, q3 \n" \ + "vadd.f32 q3, q11, q0 \n" \ + "vmul.f32 q11, q11, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q11, q11, q3 \n" \ + "vadd.f32 q3, q12, q0 \n" \ + "vmul.f32 q12, q12, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q12, q12, q3 \n" \ + "vadd.f32 q3, q13, q0 \n" \ + "vmul.f32 q13, q13, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q13, q13, q3 \n" \ + "vadd.f32 q3, q14, q0 \n" \ + "vmul.f32 q14, q14, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q14, q14, q3 \n" \ + "vadd.f32 q3, q15, q0 \n" \ + "vmul.f32 q15, q15, q1 \n" \ + "vmax.f32 q3, q3, q2 \n" \ + "vld1.f32 {d4-d5}, [%[hs_param]] \n" \ + "vmin.f32 q3, q3, q2 \n" \ + "vmov.u32 q2, #0 @ for hardswish \n" \ + "vmul.f32 q15, q15, q3 \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W48_v7_SEMI2_OUT \ + SPARSE_INT8_INT8_W48_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_INT8_W48_v7_SEMI2 \ + SPARSE_INT8_INT8_W48_v7_SEMI2_RELU \ + SPARSE_INT8_INT8_W48_v7_SEMI2_RELU6 \ + SPARSE_INT8_INT8_W48_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_INT8_W48_v7_SEMI2_HARD_SWISH \ + "vmov.f32 q0, #-0.5\n" /* neg offset */ \ + "vmov.f32 q1, #0.5\n" /* pos offset */ \ + "vcgt.f32 q2, q4, #0\n" /* get pos mask */ \ + "vbif.f32 q1, q0, q2\n" /* get right offset */ \ + "vadd.f32 q4, q1, q4\n" /* add offset */ \ + "vmov.f32 q2, #0.5\n" /* pos offset */ \ + "vcgt.f32 q3, q5, #0\n" /* get pos mask */ \ + "vbif.f32 q2, q0, q3\n" /* get right offset */ \ + "vadd.f32 q5, q2, q5\n" /* add offset */ \ 
+ "vmov.f32 q1, #0.5\n" /* pos offset */ \ + "vcgt.f32 q2, q6, #0\n" /* get pos mask */ \ + "vbif.f32 q1, q0, q2\n" /* get right offset */ \ + "vadd.f32 q6, q1, q6\n" /* add offset */ \ + "vmov.f32 q2, #0.5\n" /* pos offset */ \ + "vcgt.f32 q3, q7, #0\n" /* get pos mask */ \ + "vbif.f32 q2, q0, q3\n" /* get right offset */ \ + "vadd.f32 q7, q2, q7\n" /* add offset */ \ + "vmov.f32 q1, #0.5\n" /* pos offset */ \ + "vcgt.f32 q2, q8, #0\n" /* get pos mask */ \ + "vbif.f32 q1, q0, q2\n" /* get right offset */ \ + "vadd.f32 q8, q1, q8\n" /* add offset */ \ + "vmov.f32 q2, #0.5\n" /* pos offset */ \ + "vcgt.f32 q3, q9, #0\n" /* get pos mask */ \ + "vbif.f32 q2, q0, q3\n" /* get right offset */ \ + "vadd.f32 q9, q2, q9\n" /* add offset */ \ + "vmov.f32 q1, #0.5\n" /* pos offset */ \ + "vcgt.f32 q2, q10, #0\n" /* get pos mask */ \ + "vbif.f32 q1, q0, q2\n" /* get right offset */ \ + "vadd.f32 q10, q1, q10\n" /* add offset */ \ + "vmov.f32 q2, #0.5\n" /* pos offset */ \ + "vcgt.f32 q3, q11, #0\n" /* get pos mask */ \ + "vbif.f32 q2, q0, q3\n" /* get right offset */ \ + "vadd.f32 q11, q2, q11\n" /* add offset */ \ + "vmov.f32 q1, #0.5\n" /* pos offset */ \ + "vcgt.f32 q2, q12, #0\n" /* get pos mask */ \ + "vbif.f32 q1, q0, q2\n" /* get right offset */ \ + "vadd.f32 q12, q1, q12\n" /* add offset */ \ + "vmov.f32 q2, #0.5\n" /* pos offset */ \ + "vcgt.f32 q3, q13, #0\n" /* get pos mask */ \ + "vbif.f32 q2, q0, q3\n" /* get right offset */ \ + "vadd.f32 q13, q2, q13\n" /* add offset */ \ + "vmov.f32 q1, #0.5\n" /* pos offset */ \ + "vcgt.f32 q2, q14, #0\n" /* get pos mask */ \ + "vbif.f32 q1, q0, q2\n" /* get right offset */ \ + "vadd.f32 q14, q1, q14\n" /* add offset */ \ + "vmov.f32 q2, #0.5\n" /* pos offset */ \ + "vcgt.f32 q3, q15, #0\n" /* get pos mask */ \ + "vbif.f32 q2, q0, q3\n" /* get right offset */ \ + "vadd.f32 q15, q2, q15\n" /* add offset */ \ + "vld1.f32 {d0-d1}, [%[vmax]] \n" \ + "vcge.f32 q1, q4, q0\n" \ + "vcge.f32 q2, q5, q0\n" \ + "vcge.f32 q3, q6, q0\n" \ + "vbif q4, q0, q1\n" \ + "vbif q5, q0, q2\n" \ + "vbif q6, q0, q3\n" \ + "vcge.f32 q1, q7, q0\n" \ + "vcge.f32 q2, q8, q0\n" \ + "vcge.f32 q3, q9, q0\n" \ + "vbif q7, q0, q1\n" \ + "vbif q8, q0, q2\n" \ + "vbif q9, q0, q3\n" \ + "vcge.f32 q1, q10, q0\n" \ + "vcge.f32 q2, q11, q0\n" \ + "vcge.f32 q3, q12, q0\n" \ + "vbif q10, q0, q1\n" \ + "vbif q11, q0, q2\n" \ + "vbif q12, q0, q3\n" \ + "vcge.f32 q1, q13, q0\n" \ + "vcge.f32 q2, q14, q0\n" \ + "vcge.f32 q3, q15, q0\n" \ + "vbif q13, q0, q1\n" \ + "vbif q14, q0, q2\n" \ + "vbif q15, q0, q3\n" \ + "vcvt.s32.f32 q0, q4\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q1, q5\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q2, q6\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q3, q7\n" /* fp32->int32 */ \ + "vqmovn.s32 d8, q0\n" /* int32 -> int16 */ \ + "vqmovn.s32 d9, q1\n" /* int32 -> int16 */ \ + "vqmovn.s32 d10, q2\n" /* int32 -> int16 */ \ + "vqmovn.s32 d11, q3\n" /* int32 -> int16 */ \ + "vqmovn.s16 d12, q4\n" /* 0, int16 -> int8 */ \ + "vqmovn.s16 d13, q5\n" /* 1, int16 -> int8 */ \ + "vcvt.s32.f32 q0, q8\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q1, q9\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q2, q10\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q3, q11\n" /* fp32->int32 */ \ + "vqmovn.s32 d8, q0\n" /* int32 -> int16 */ \ + "vqmovn.s32 d9, q1\n" /* int32 -> int16 */ \ + "vqmovn.s32 d10, q2\n" /* int32 -> int16 */ \ + "vqmovn.s32 d11, q3\n" /* int32 -> int16 */ \ + "vqmovn.s16 d14, q4\n" /* 0, int16 -> int8 */ \ + "vqmovn.s16 d15, q5\n" /* 1, int16 -> int8 */ \ + "vcvt.s32.f32 q0, q12\n" /* fp32->int32 */ \ + 
"vcvt.s32.f32 q1, q13\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q2, q14\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q3, q15\n" /* fp32->int32 */ \ + "vqmovn.s32 d8, q0\n" /* int32 -> int16 */ \ + "vqmovn.s32 d9, q1\n" /* int32 -> int16 */ \ + "vqmovn.s32 d10, q2\n" /* int32 -> int16 */ \ + "vqmovn.s32 d11, q3\n" /* int32 -> int16 */ \ + "vqmovn.s16 d16, q4\n" /* 0, int16 -> int8 */ \ + "vqmovn.s16 d17, q5\n" /* 1, int16 -> int8 */ \ + "mov r0, %[c_ptr]\n" /* store result */ \ + "vst1.32 {d12-d13}, [r0]\n" \ + "add r0, r0, #16\n" \ + "vst1.32 {d14-d15}, [r0]\n" \ + "add r0, r0, #16\n" \ + "vst1.32 {d16-d17}, [r0]\n" + +#define SPARSE_INT8_INT8_W32_v7_SEMI2_KERNEL \ + "veor q8, q0, q0\n" \ + "veor q9, q1, q1\n" \ + "pld [%[a_ptr], #32] \n" \ + "veor q10, q2, q2\n" \ + "veor q11, q3, q3\n" \ + "pld [%[widx_dmap], #32] \n" \ + "veor q12, q4, q4\n" \ + "veor q13, q5, q5\n" \ + "pld [%[b_ptr], #64] \n" \ + "veor q14, q6, q6\n" \ + "veor q15, q7, q7\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldrsb r0, [%[a_ptr]]\n" \ + "subs %[k], %[k], #1\n" \ + "vld1.8 {d2-d5}, [%[b_ptr]]\n" \ + "add %[a_ptr], %[a_ptr], %[vdif]\n" \ + "vdup.8 d0, r0\n" \ + "vmull.s8 q3, d2, d0\n" \ + "vmull.s8 q4, d3, d0\n" \ + "pld [%[widx_dmap], #32] \n" \ + "vmull.s8 q5, d4, d0\n" \ + "vmull.s8 q6, d5, d0\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "vaddw.s16 q8, q8, d6\n" \ + "vaddw.s16 q9, q9, d7\n" \ + "pld [%[a_ptr], #32] \n" \ + "vaddw.s16 q10, q10, d8\n" \ + "vaddw.s16 q11, q11, d9\n" \ + "vaddw.s16 q12, q12, d10\n" \ + "pld [%[b_ptr], #64] \n" \ + "vaddw.s16 q13, q13, d11\n" \ + "vaddw.s16 q14, q14, d12\n" \ + "vaddw.s16 q15, q15, d13\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_INT8_TRANS_INT32_TO_INT8_W32_v7_SEMI2 \ + /* write output */ \ + "vcvt.f32.s32 q0, q8\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q1, q9\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q2, q10\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q11\n" /* cvt int32 to fp32*/ \ + "vld1.32 {d8-d9}, [%[bias_scale]] \n" \ + "add %[bias_scale], #16 \n" \ + "vld1.32 {d16-d17}, [%[bias_scale]] \n" \ + "vdup.32 q5, d8[0]\n" \ + "vdup.32 q6, d8[0]\n" \ + "vdup.32 q7, d8[0]\n" \ + "3:\n" \ + "vmla.f32 q4, q0, q8\n" \ + "vmla.f32 q5, q1, q8\n" \ + "vmla.f32 q6, q2, q8\n" \ + "vmla.f32 q7, q3, q8\n" \ + "4:\n" \ + "vcvt.f32.s32 q8, q12\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q9, q13\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q10, q14\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q11, q15\n" /* cvt int32 to fp32*/ \ + "vld1.32 {d24-d25}, [%[bias_scale]] \n" \ + "sub %[bias_scale], #16 \n" \ + "vld1.32 {d0-d1}, [%[bias_scale]] \n" \ + "vdup.32 q1, d0[0]\n" \ + "vdup.32 q2, d0[0]\n" \ + "vdup.32 q3, d0[0]\n" \ + "6:\n" \ + "vmla.f32 q0, q8, q12\n" \ + "vmla.f32 q1, q9, q12\n" \ + "vmla.f32 q2, q10, q12\n" \ + "vmla.f32 q3, q11, q12\n" + +#define SPARSE_INT8_INT8_W32_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q8, #0\n" /* for relu */ \ + "vmax.f32 q0, q0, q8\n" /* relu */ \ + "vmax.f32 q1, q1, q8\n" /* relu */ \ + "vmax.f32 q2, q2, q8\n" /* relu */ \ + "vmax.f32 q3, q3, q8\n" /* relu */ \ + "vmax.f32 q4, q4, q8\n" /* relu */ \ + "vmax.f32 q5, q5, q8\n" /* relu */ \ + "vmax.f32 q6, q6, q8\n" /* relu */ \ + "vmax.f32 q7, q7, q8\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W32_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp 
%[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q8, #0\n" /* for relu6 */ \ + "vdup.32 q9, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q0, q0, q8\n" /* relu6 */ \ + "vmax.f32 q1, q1, q8\n" /* relu6 */ \ + "vmax.f32 q2, q2, q8\n" /* relu6 */ \ + "vmax.f32 q3, q3, q8\n" /* relu6 */ \ + "vmax.f32 q4, q4, q8\n" /* relu6 */ \ + "vmax.f32 q5, q5, q8\n" /* relu6 */ \ + "vmax.f32 q6, q6, q8\n" /* relu6 */ \ + "vmax.f32 q7, q7, q8\n" /* relu6 */ \ + "vmin.f32 q0, q0, q9\n" /* relu6 */ \ + "vmin.f32 q1, q1, q9\n" /* relu6 */ \ + "vmin.f32 q2, q2, q9\n" /* relu6 */ \ + "vmin.f32 q3, q3, q9\n" /* relu6 */ \ + "vmin.f32 q4, q4, q9\n" /* relu6 */ \ + "vmin.f32 q5, q5, q9\n" /* relu6 */ \ + "vmin.f32 q6, q6, q9\n" /* relu6 */ \ + "vmin.f32 q7, q7, q9\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W32_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q8, #0\n" /* for relu */ \ + "vdup.32 q9, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q10, q0, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q11, q0, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q12, q1, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q13, q1, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q14, q2, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q15, q2, q9 \n" /* vmulq_f32 */ \ + "vbif q0, q11, q10 \n" \ + "vbif q1, q13, q12 \n" \ + "vbif q2, q15, q14 \n" \ + "vcge.f32 q10, q3, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q11, q3, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q12, q4, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q13, q4, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q14, q5, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q15, q5, q9 \n" /* vmulq_f32 */ \ + "vbif q3, q11, q10 \n" \ + "vbif q4, q13, q12 \n" \ + "vbif q5, q15, q14 \n" \ + "vcge.f32 q10, q6, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q11, q6, q9 \n" /* vmulq_f32 */ \ + "vcge.f32 q12, q7, q8 \n" /* vcgeq_f32 */ \ + "vmul.f32 q13, q7, q9 \n" /* vmulq_f32 */ \ + "vbif q6, q11, q10 \n" \ + "vbif q7, q13, q12 \n" \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W32_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q8, #0 \n" \ + "vld1.f32 {d18-d21}, [%[hs_param]]! 
\n" \ + "vld1.f32 {d22-d23}, [%[hs_param]] \n" \ + "vadd.f32 q12, q0, q9 \n" \ + "vadd.f32 q14, q1, q9 \n" \ + "vmul.f32 q13, q0, q10 \n" \ + "vmul.f32 q14, q1, q10 \n" \ + "vmax.f32 q12, q12, q8 \n" \ + "vmax.f32 q14, q14, q8 \n" \ + "vmin.f32 q12, q12, q11 \n" \ + "vmin.f32 q14, q14, q11 \n" \ + "vmul.f32 q0, q13, q12 \n" \ + "vmul.f32 q1, q15, q14 \n" \ + "vadd.f32 q12, q2, q9 \n" \ + "vadd.f32 q14, q3, q9 \n" \ + "vmul.f32 q13, q2, q10 \n" \ + "vmul.f32 q14, q3, q10 \n" \ + "vmax.f32 q12, q12, q8 \n" \ + "vmax.f32 q14, q14, q8 \n" \ + "vmin.f32 q12, q12, q11 \n" \ + "vmin.f32 q14, q14, q11 \n" \ + "vmul.f32 q2, q13, q12 \n" \ + "vmul.f32 q3, q15, q14 \n" \ + "vadd.f32 q12, q4, q9 \n" \ + "vadd.f32 q14, q5, q9 \n" \ + "vmul.f32 q13, q4, q10 \n" \ + "vmul.f32 q14, q5, q10 \n" \ + "vmax.f32 q12, q12, q8 \n" \ + "vmax.f32 q14, q14, q8 \n" \ + "vmin.f32 q12, q12, q11 \n" \ + "vmin.f32 q14, q14, q11 \n" \ + "vmul.f32 q4, q13, q12 \n" \ + "vmul.f32 q5, q15, q14 \n" \ + "vadd.f32 q12, q6, q9 \n" \ + "vadd.f32 q14, q7, q9 \n" \ + "vmul.f32 q13, q6, q10 \n" \ + "vmul.f32 q14, q7, q10 \n" \ + "vmax.f32 q12, q12, q8 \n" \ + "vmax.f32 q14, q14, q8 \n" \ + "vmin.f32 q12, q12, q11 \n" \ + "vmin.f32 q14, q14, q11 \n" \ + "vmul.f32 q6, q13, q12 \n" \ + "vmul.f32 q7, q15, q14 \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W32_v7_SEMI2_OUT \ + SPARSE_INT8_INT8_W32_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_INT8_W32_v7_SEMI2 \ + SPARSE_INT8_INT8_W32_v7_SEMI2_RELU \ + SPARSE_INT8_INT8_W32_v7_SEMI2_RELU6 \ + SPARSE_INT8_INT8_W32_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_INT8_W32_v7_SEMI2_HARD_SWISH \ + "vmov.f32 q8, #-0.5\n" /* neg offset */ \ + "vmov.f32 q10, #0.5\n" /* pos offset */ \ + "vmov.f32 q11, #0.5\n" /* pos offset */ \ + "vmov.f32 q12, #0.5\n" /* pos offset */ \ + "vmov.f32 q13, #0.5\n" /* pos offset */ \ + "vcgt.f32 q14, q0, #0\n" /* get pos mask */ \ + "vcgt.f32 q15, q1, #0\n" /* get pos mask */ \ + "vbif.f32 q10, q8, q14\n" /* get right offset */ \ + "vbif.f32 q11, q8, q15\n" /* get right offset */ \ + "vcgt.f32 q14, q2, #0\n" /* get pos mask */ \ + "vcgt.f32 q15, q3, #0\n" /* get pos mask */ \ + "vbif.f32 q12, q8, q14\n" /* get right offset */ \ + "vbif.f32 q13, q8, q15\n" /* get right offset */ \ + "vadd.f32 q0, q10, q0\n" /* add offset */ \ + "vadd.f32 q1, q11, q1\n" /* add offset */ \ + "vadd.f32 q2, q12, q2\n" /* add offset */ \ + "vadd.f32 q3, q13, q3\n" /* add offset */ \ + "vmov.f32 q10, #0.5\n" /* pos offset */ \ + "vmov.f32 q11, #0.5\n" /* pos offset */ \ + "vmov.f32 q12, #0.5\n" /* pos offset */ \ + "vmov.f32 q13, #0.5\n" /* pos offset */ \ + "vcgt.f32 q14, q4, #0\n" /* get pos mask */ \ + "vcgt.f32 q15, q5, #0\n" /* get pos mask */ \ + "vbif.f32 q10, q8, q14\n" /* get right offset */ \ + "vbif.f32 q11, q8, q15\n" /* get right offset */ \ + "vcgt.f32 q14, q6, #0\n" /* get pos mask */ \ + "vcgt.f32 q15, q7, #0\n" /* get pos mask */ \ + "vbif.f32 q12, q8, q14\n" /* get right offset */ \ + "vbif.f32 q13, q8, q15\n" /* get right offset */ \ + "vadd.f32 q4, q10, q4\n" /* add offset */ \ + "vadd.f32 q5, q11, q5\n" /* add offset */ \ + "vadd.f32 q6, q12, q6\n" /* add offset */ \ + "vadd.f32 q7, q13, q7\n" /* add offset */ \ + "vld1.f32 {d16-d17}, [%[vmax]] \n" \ + "vcge.f32 q9, q0, q8\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q10, q1, q8\n" /* @ q9 >= -127 \n */ \ + "vcge.f32 q11, q2, q8\n" /* @ q0 >= -127 \n */ \ + "vcge.f32 q12, q3, q8\n" /* @ q1 >= -127 \n */ \ + "vcge.f32 q13, q4, q8\n" /* @ q2 >= -127 \n */ \ + "vcge.f32 q14, q5, q8\n" /* @ q3 >= -127 \n */ \ + "vcge.f32 q15, q6, q8\n" /* @ q4 
>= -127 \n */ /* choose data */ \ + "vbif q0, q8, q9\n" /* @ choose */ \ + "vcge.f32 q9, q7, q8\n" /* @ q5 >= -127 \n */ \ + "vbif q1, q8, q10\n" /* @ choose */ \ + "vbif q2, q8, q11\n" /* @ choose */ \ + "vbif q3, q8, q12\n" /* @ choose */ \ + "vbif q4, q8, q13\n" /* @ choose */ \ + "vbif q5, q8, q14\n" /* @ choose */ \ + "vbif q6, q8, q15\n" /* @ choose */ \ + "vbif q7, q8, q9\n" /* @ choose */ \ + "vcvt.s32.f32 q8, q0\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q9, q1\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q10, q2\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q11, q3\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q12, q4\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q13, q5\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q14, q6\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q15, q7\n" /* fp32->int32 */ \ + "vqmovn.s32 d0, q8\n" /* int32 -> int16 */ \ + "vqmovn.s32 d1, q9\n" /* int32 -> int16 */ \ + "vqmovn.s32 d2, q10\n" /* int32 -> int16 */ \ + "vqmovn.s32 d3, q11\n" /* int32 -> int16 */ \ + "vqmovn.s32 d4, q12\n" /* int32 -> int16 */ \ + "vqmovn.s32 d5, q13\n" /* int32 -> int16 */ \ + "vqmovn.s32 d6, q14\n" /* int32 -> int16 */ \ + "vqmovn.s32 d7, q15\n" /* int32 -> int16 */ \ + "vqmovn.s16 d8, q0\n" /* 0, int16 -> int8 */ \ + "vqmovn.s16 d9, q1\n" /* 1, int16 -> int8 */ \ + "vqmovn.s16 d10, q2\n" /* 2, int16 -> int8 */ \ + "vqmovn.s16 d11, q3\n" /* 3, int16 -> int8 */ \ + "mov r0, %[c_ptr]\n" /* store result */ \ + "vst1.32 {d10-d11}, [r0]\n" \ + "add r0, r0, #16\n" \ + "vst1.32 {d8-d9}, [r0]\n" + +#define SPARSE_INT8_INT8_W16_v7_SEMI2_KERNEL \ + "veor q4, q0, q0\n" \ + "pld [%[a_ptr], #32] \n" \ + "veor q5, q1, q1\n" \ + "pld [%[widx_dmap], #32] \n" \ + "veor q6, q2, q2\n" \ + "pld [%[b_ptr], #32] \n" \ + "veor q7, q3, q3\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldrsb r0, [%[a_ptr]]\n" \ + "subs %[k], %[k], #1\n" \ + "vld1.8 {d2-d3}, [%[b_ptr]]\n" \ + "add %[a_ptr], %[a_ptr], %[vdif]\n" \ + "vdup.8 d0, r0\n" \ + "vmull.s8 q2, d2, d0\n" \ + "vmull.s8 q3, d3, d0\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "vaddw.s16 q4, q4, d4\n" \ + "vaddw.s16 q5, q5, d5\n" \ + "pld [%[b_ptr], #32] \n" \ + "vaddw.s16 q6, q6, d6\n" \ + "vaddw.s16 q7, q7, d7\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_INT8_TRANS_INT32_TO_INT8_W16_v7_SEMI2 \ + /* write output */ \ + "vcvt.f32.s32 q0, q4\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q1, q5\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q2, q6\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q3, q7\n" /* cvt int32 to fp32*/ \ + "vld1.32 {d16-d17}, [%[bias_scale]] \n" \ + "add %[bias_scale], #16 \n" \ + "vld1.32 {d8-d9}, [%[bias_scale]] \n" \ + "vdup.32 q9, d16[0]\n" \ + "vdup.32 q10, d16[0]\n" \ + "vdup.32 q11, d16[0]\n" \ + "3:\n" \ + "vmla.f32 q8, q0, q4\n" \ + "vmla.f32 q9, q1, q4\n" \ + "vmla.f32 q10, q2, q4\n" \ + "vmla.f32 q11, q3, q4\n" + +#define SPARSE_INT8_INT8_W16_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q8, q8, q0\n" /* relu */ \ + "vmax.f32 q9, q9, q0\n" /* relu */ \ + "vmax.f32 q10, q10, q0\n" /* relu */ \ + "vmax.f32 q11, q11, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W16_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 
q8, q8, q0\n" /* relu6 */ \ + "vmax.f32 q9, q9, q0\n" /* relu6 */ \ + "vmax.f32 q10, q10, q0\n" /* relu6 */ \ + "vmax.f32 q11, q11, q0\n" /* relu6 */ \ + "vmin.f32 q8, q8, q1\n" /* relu6 */ \ + "vmin.f32 q9, q9, q1\n" /* relu6 */ \ + "vmin.f32 q10, q10, q1\n" /* relu6 */ \ + "vmin.f32 q11, q11, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W16_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q8, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q8, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q9, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q9, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q6, q10, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q7, q10, q1 \n" /* vmulq_f32 */ \ + "vbif q8, q3, q2 \n" \ + "vbif q9, q5, q4 \n" \ + "vbif q10, q7, q6 \n" \ + "vcge.f32 q2, q11, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q11, q1 \n" /* vmulq_f32 */ \ + "vbif q11, q3, q2 \n" \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W16_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! \n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q8, q1 \n" \ + "vadd.f32 q6, q9, q1 \n" \ + "vmul.f32 q5, q8, q2 \n" \ + "vmul.f32 q7, q9, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q8, q5, q4 \n" \ + "vmul.f32 q9, q7, q6 \n" \ + "vadd.f32 q4, q10, q1 \n" \ + "vadd.f32 q6, q11, q1 \n" \ + "vmul.f32 q5, q10, q2 \n" \ + "vmul.f32 q7, q11, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q6, q6, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q6, q6, q3 \n" \ + "vmul.f32 q10, q5, q4 \n" \ + "vmul.f32 q11, q7, q6 \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W16_v7_SEMI2_OUT \ + SPARSE_INT8_INT8_W16_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_INT8_W16_v7_SEMI2 \ + SPARSE_INT8_INT8_W16_v7_SEMI2_RELU \ + SPARSE_INT8_INT8_W16_v7_SEMI2_RELU6 \ + SPARSE_INT8_INT8_W16_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_INT8_W16_v7_SEMI2_HARD_SWISH \ + "vmov.f32 q0, #-0.5\n" /* neg offset */ \ + "vmov.f32 q1, #0.5\n" /* pos offset */ \ + "vmov.f32 q2, #0.5\n" /* pos offset */ \ + "vmov.f32 q3, #0.5\n" /* pos offset */ \ + "vmov.f32 q4, #0.5\n" /* pos offset */ \ + "vcgt.f32 q5, q8, #0\n" /* get pos mask */ \ + "vcgt.f32 q6, q9, #0\n" /* get pos mask */ \ + "vbif.f32 q1, q0, q5\n" /* get right offset */ \ + "vbif.f32 q2, q0, q6\n" /* get right offset */ \ + "vcgt.f32 q12, q10, #0\n" /* get pos mask */ \ + "vcgt.f32 q13, q11, #0\n" /* get pos mask */ \ + "vbif.f32 q3, q0, q12\n" /* get right offset */ \ + "vbif.f32 q4, q0, q13\n" /* get right offset */ \ + "vadd.f32 q8, q1, q8\n" /* add offset */ \ + "vadd.f32 q9, q2, q9\n" /* add offset */ \ + "vadd.f32 q10, q3, q10\n" /* add offset */ \ + "vadd.f32 q11, q4, q11\n" /* add offset */ \ + "vld1.f32 {d14-d15}, [%[vmax]] \n" \ + "vcge.f32 q0, q8, q7\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q1, q9, q7\n" /* @ q9 >= -127 \n */ \ + "vcge.f32 q2, q10, q7\n" /* @ q0 >= -127 \n */ \ + "vcge.f32 q3, q11, q7\n" /* @ q1 >= -127 \n */ \ + "vbif q8, q7, q0\n" /* @ choose */ \ + "vbif q9, q7, q1\n" /* @ choose */ \ + "vbif q10, q7, q2\n" /* @ choose */ \ + "vbif q11, q7, q3\n" /* @ choose */ \ + "vcvt.s32.f32 q0, q8\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q1, q9\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q2, q10\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q3, q11\n" /* fp32->int32 */ \ + 
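/* two-stage saturating narrow below: vqmovn.s32 clamps each lane to int16, vqmovn.s16 clamps to int8; together with the -127 floor applied above, the stored bytes stay in [-127, 127] */ \ +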
"vqmovn.s32 d8, q0\n" /* int32 -> int16 */ \ + "vqmovn.s32 d9, q1\n" /* int32 -> int16 */ \ + "vqmovn.s32 d10, q2\n" /* int32 -> int16 */ \ + "vqmovn.s32 d11, q3\n" /* int32 -> int16 */ \ + "vqmovn.s16 d0, q4\n" /* 0, int16 -> int8 */ \ + "vqmovn.s16 d1, q5\n" /* 1, int16 -> int8 */ /* store result */ \ + "vst1.32 {d0-d1}, [%[c_ptr]]\n" + +#define SPARSE_INT8_INT8_W8_v7_SEMI2_KERNEL \ + "veor q4, q0, q0\n" \ + "veor q5, q1, q1\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldrsb r0, [%[a_ptr]]\n" \ + "subs %[k], %[k], #1\n" \ + "add %[a_ptr], %[a_ptr], %[vdif]\n" \ + "vld1.8 d2, [%[b_ptr]]\n" \ + "vdup.8 d0, r0\n" \ + "vmull.s8 q2, d2, d0\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "vaddw.s16 q4, q4, d4\n" \ + "vaddw.s16 q5, q5, d5\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_INT8_TRANS_INT32_TO_INT8_W8_v7_SEMI2 \ + /* write output */ \ + "vcvt.f32.s32 q0, q4\n" /* cvt int32 to fp32*/ \ + "vcvt.f32.s32 q1, q5\n" /* cvt int32 to fp32*/ \ + "vld1.32 {d12-d13}, [%[bias_scale]] \n" \ + "add %[bias_scale], #16 \n" \ + "vld1.32 {d4-d5}, [%[bias_scale]] \n" \ + "vdup.32 q7, d12[0]\n" \ + "3:\n" \ + "vmla.f32 q6, q0, q2\n" \ + "vmla.f32 q7, q1, q2\n" + +#define SPARSE_INT8_INT8_W8_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q6, q6, q0\n" /* relu */ \ + "vmax.f32 q7, q7, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W8_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q6, q6, q0\n" /* relu6 */ \ + "vmax.f32 q7, q7, q0\n" /* relu6 */ \ + "vmin.f32 q6, q6, q1\n" /* relu6 */ \ + "vmin.f32 q7, q7, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W8_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q6, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q6, q1 \n" /* vmulq_f32 */ \ + "vcge.f32 q4, q7, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q5, q7, q1 \n" /* vmulq_f32 */ \ + "vbif q6, q3, q2 \n" \ + "vbif q7, q5, q4 \n" \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W8_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + "vld1.f32 {d2-d5}, [%[hs_param]]! 
\n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q4, q6, q1 \n" \ + "vadd.f32 q8, q7, q1 \n" \ + "vmul.f32 q5, q6, q2 \n" \ + "vmul.f32 q9, q7, q2 \n" \ + "vmax.f32 q4, q4, q0 \n" \ + "vmax.f32 q8, q8, q0 \n" \ + "vmin.f32 q4, q4, q3 \n" \ + "vmin.f32 q8, q8, q3 \n" \ + "vmul.f32 q6, q5, q4 \n" \ + "vmul.f32 q7, q9, q8 \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W8_v7_SEMI2_OUT \ + SPARSE_INT8_INT8_W8_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_INT8_W8_v7_SEMI2 \ + SPARSE_INT8_INT8_W8_v7_SEMI2_RELU SPARSE_INT8_INT8_W8_v7_SEMI2_RELU6 \ + SPARSE_INT8_INT8_W8_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_INT8_W8_v7_SEMI2_HARD_SWISH \ + "vmov.f32 q8, #-0.5\n" /* neg offset */ \ + "vmov.f32 q0, #0.5\n" /* pos offset */ \ + "vmov.f32 q1, #0.5\n" /* pos offset */ \ + "vmov.f32 q2, #0.5\n" /* pos offset */ \ + "vmov.f32 q3, #0.5\n" /* pos offset */ \ + "vcgt.f32 q4, q6, #0\n" /* get pos mask */ \ + "vcgt.f32 q5, q7, #0\n" /* get pos mask */ \ + "vbif.f32 q0, q8, q4\n" /* get right offset */ \ + "vbif.f32 q1, q8, q5\n" /* get right offset */ \ + "vadd.f32 q6, q0, q6\n" /* add offset */ \ + "vadd.f32 q7, q1, q7\n" /* add offset */ \ + "vld1.f32 {d0-d1}, [%[vmax]] \n" \ + "vcge.f32 q1, q6, q0\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q2, q7, q0\n" /* @ q9 >= -127 \n */ \ + "vbif q6, q0, q1\n" /* @ choose */ \ + "vbif q7, q0, q2\n" /* @ choose */ \ + "vcvt.s32.f32 q0, q6\n" /* fp32->int32 */ \ + "vcvt.s32.f32 q1, q7\n" /* fp32->int32 */ \ + "vqmovn.s32 d8, q0\n" /* int32 -> int16 */ \ + "vqmovn.s32 d9, q1\n" /* int32 -> int16 */ \ + "vqmovn.s16 d0, q4\n" /* 0, int16 -> int8 */ /* store result */ \ + "vst1.32 {d0}, [%[c_ptr]]\n" + +#define SPARSE_INT8_INT8_W4_v7_SEMI2_KERNEL \ + "veor q3, q0, q0\n" \ + "cmp %[k], #0\n" \ + "beq 1f\n" /* main loop*/ \ + "0:\n" \ + "ldrsb r0, [%[a_ptr]]\n" \ + "vdup.8 d0, r0\n" \ + "subs %[k], %[k], #1\n" \ + "ldr r0, [%[b_ptr]]\n" \ + "add %[a_ptr], %[a_ptr], %[vdif]\n" \ + "vdup.32 d2, r0\n" \ + "vmull.s8 q2, d2, d0\n" \ + "ldr r1, [%[widx_dmap]], #4\n" \ + "add %[b_ptr], %[b_ptr], r1\n" \ + "vaddw.s16 q3, q3, d4\n" \ + "bne 0b\n" \ + "1:\n" + +#define SPARSE_INT8_TRANS_INT32_TO_INT8_W4_v7_SEMI2 \ + /* write output */ \ + "vcvt.f32.s32 q0, q3\n" /* cvt int32 to fp32*/ \ + "vld1.32 {d8-d9}, [%[bias_scale]] \n" \ + "add %[bias_scale], #16 \n" \ + "vld1.32 {d2-d3}, [%[bias_scale]] \n" \ + "3:\n" \ + "vmla.f32 q4, q0, q1\n" + +#define SPARSE_INT8_INT8_W4_v7_SEMI2_RELU \ + /* do relu */ \ + "cmp %[vflag_act], #0\n" /* skip relu */ \ + "beq 9f \n" /* no act end */ \ + "cmp %[vflag_act], #1\n" /* skip relu */ \ + "bne 10f \n" /* other act */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vmax.f32 q4, q4, q0\n" /* relu */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W4_v7_SEMI2_RELU6 \ + /* do relu6 */ \ + "10: \n" \ + "cmp %[vflag_act], #2 \n" /* check relu6 */ \ + "bne 11f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu6 */ \ + "vdup.32 q1, %[valpha]\n" /* relu6 alpha */ \ + "vmax.f32 q4, q4, q0\n" /* relu6 */ \ + "vmin.f32 q4, q4, q1\n" /* relu6 */ \ + "b 9f \n" /* relu end */ + +#define SPARSE_INT8_INT8_W4_v7_SEMI2_LEAKY_RELU \ + /* do relu */ \ + "11: \n" \ + "cmp %[vflag_act], #3 \n" /* check leakey relu */ \ + "bne 12f \n" /* no act end */ \ + "vmov.i32 q0, #0\n" /* for relu */ \ + "vdup.32 q1, %[valpha]\n" /* leakey relu alpha */ \ + "vcge.f32 q2, q4, q0 \n" /* vcgeq_f32 */ \ + "vmul.f32 q3, q4, q1 \n" /* vmulq_f32 */ \ + "vbif q4, q3, q2 \n" \ + "b 9f \n" + +#define SPARSE_INT8_INT8_W4_v7_SEMI2_HARD_SWISH \ + "12: \n" \ + "vmov.u32 q0, #0 \n" \ + 
"vld1.f32 {d2-d5}, [%[hs_param]]! \n" \ + "vld1.f32 {d6-d7}, [%[hs_param]] \n" \ + "vadd.f32 q5, q4, q1 \n" \ + "vmul.f32 q6, q4, q2 \n" \ + "vmax.f32 q5, q5, q0 \n" \ + "vmin.f32 q5, q5, q3 \n" \ + "vmul.f32 q4, q6, q5 \n" \ + "9:\n" + +#define SPARSE_INT8_INT8_W4_v7_SEMI2_OUT \ + SPARSE_INT8_INT8_W4_v7_SEMI2_KERNEL \ + SPARSE_INT8_TRANS_INT32_TO_INT8_W4_v7_SEMI2 \ + SPARSE_INT8_INT8_W4_v7_SEMI2_RELU SPARSE_INT8_INT8_W4_v7_SEMI2_RELU6 \ + SPARSE_INT8_INT8_W4_v7_SEMI2_LEAKY_RELU \ + SPARSE_INT8_INT8_W4_v7_SEMI2_HARD_SWISH \ + "vmov.f32 q1, #-0.5\n" /* neg offset */ \ + "vmov.f32 q0, #0.5\n" /* pos offset */ \ + "vcgt.f32 q2, q4, #0\n" /* get pos mask */ \ + "vbif.f32 q0, q1, q2\n" /* get right offset */ \ + "vadd.f32 q4, q0, q4\n" /* add offset */ \ + "vld1.f32 {d0-d1}, [%[vmax]] \n" \ + "vcge.f32 q1, q4, q0\n" /* @ q8 >= -127 \n */ \ + "vbif q4, q0, q1\n" /* @ choose */ \ + "vcvt.s32.f32 q0, q4\n" /* fp32->int32 */ \ + "vqmovn.s32 d2, q0\n" /* int32 -> int16 */ \ + "vqmovn.s16 d0, q1\n" /* 0, int16 -> int8 */ /* store result */ \ + "vst1.32 d0[0], [%[c_ptr]]\n" + +#define GET_UNSTRUCT_INT8_V7_PARAM_TABLE(Dtype_i, Dtype_o) \ + Dtype_o* out_ptr = reinterpret_cast((uintptr_t)output + \ + output_stride * 2 * pair_num); \ + const Dtype_i* cur_w = A; \ + uint32_t nnz = nidx_nnzmap[pair_num]; \ + const Dtype_i* cur_b = B; \ + const int32_t* dmap = widx_dmap; \ + if (pair_num != 0) { \ + cur_w = A + nidx_nnzmap[pair_num - 1] * 2; \ + nnz = nidx_nnzmap[pair_num] - nidx_nnzmap[pair_num - 1]; \ + cur_b += \ + ((nidx_nnzmap[pair_num - 1] == 0) \ + ? 0 \ + : widx_dmap[nidx_nnzmap[pair_num - 1] - 1] / sizeof(Dtype_i)); \ + dmap = widx_dmap + nidx_nnzmap[pair_num - 1]; \ + } \ + float* bias_scale = vbias_scale; \ + if (bias) { \ + bias_scale[0] = bias[pair_num * 2]; \ + bias_scale[1] = bias_scale[0]; \ + bias_scale[2] = bias_scale[0]; \ + bias_scale[3] = bias_scale[0]; \ + } \ + bias_scale[4] = scale[pair_num * 2]; \ + bias_scale[5] = bias_scale[4]; \ + bias_scale[6] = bias_scale[4]; \ + bias_scale[7] = bias_scale[4]; \ + float* hs_param = vhs_param; + +#define SET_ASSM_INPUT_PARAM1_1_V7_INT8_INT8 \ +: [a_ptr] "+r"(cur_w), \ + [b_ptr] "+r"(cur_b), \ + [c_ptr] "+r"(out_ptr1), \ + [widx_dmap] "+r"(dmap), \ + [k] "+r"(nnz), \ + [bias_scale] "+r"(bias_scale), \ + [hs_param] "+r"(hs_param) \ +: [vflag_act] "r"(flag_act), \ + [vmax] "r"(vmax), \ + [valpha] "r"(alpha), \ + [vdif] "r"(2) + +#define SET_ASSM_INPUT_PARAM1_2_V7_INT8_INT8 \ +: [a_ptr] "+r"(cur_w2), \ + [b_ptr] "+r"(cur_b2), \ + [c_ptr] "+r"(out_ptr2), \ + [widx_dmap] "+r"(dmap2), \ + [k] "+r"(nnz2), \ + [bias_scale] "+r"(bias_scale), \ + [hs_param] "+r"(hs_param) \ +: [vflag_act] "r"(flag_act), \ + [vmax] "r"(vmax), \ + [valpha] "r"(alpha), \ + [vdif] "r"(2) + +#define SET_ASSM_INPUT_PARAM2_V7_INT8_INT8 \ +: [a_ptr] "+r"(cur_w), \ + [b_ptr] "+r"(cur_b), \ + [c_ptr] "+r"(out_ptr), \ + [widx_dmap] "+r"(dmap), \ + [k] "+r"(nnz), \ + [bias_scale] "+r"(bias_scale), \ + [hs_param] "+r"(hs_param) \ +: [vflag_act] "r"(flag_act), \ + [vmax] "r"(vmax), \ + [valpha] "r"(alpha), \ + [vdif] "r"(1) + +/** + * \brief Sparse calculation implementation of 1x1 convolution, both input and + * output are int8. + * Sparse matrix multiplication is calculated in blocks, the block size is Mx48, + * that is, + * the parameter matrix is MxK, and the activation matrix is Kx48; when N is + * less than 48, + * it is calculated in blocks of Mx32, Mx16, Mx8, and Mx4 in turn; + * \brief the input type is int8, and the output type is also int8. 
+ * @param A sparse weight data + * @param B dense input data + * @param widx_dmap An array of int32_t values storing scaled [by sizeof(input + * element)] difference + * between input channels corresponding to successive non-zero element + * @param nidx_nnzmap the number of non-zero kernel elements per each output + * channel + * @param bias + * @param output + * @param M + * @param N + * @param K + * @param param + * @param ctx + */ +void sparse_semi_conv_int8_int8_pipelined( + const int8_t* A, + const int8_t* B, + const int32_t* widx_dmap, + const uint32_t* nidx_nnzmap, + const float* bias, + const float* scale, + int8_t* output, + int M, + int K, + int N, + const operators::SparseConvParam& param, + ARMContext* ctx) { + auto act_param = param.activation_param; + auto act_type = act_param.active_type; + volatile float alpha = 0.f; + float vhs_param[12] = {0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + alpha = act_param.Relu_clipped_coef; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + alpha = act_param.Leaky_relu_alpha; + } else if (act_type == lite_api::ActivationType::kHardSwish) { + flag_act = 0x04; + for (int i = 0; i < 4; i++) { + vhs_param[i] = act_param.hard_swish_offset / param.output_scale; + vhs_param[i + 4] = 1.0 / act_param.hard_swish_scale; + vhs_param[i + 8] = act_param.hard_swish_threshold / param.output_scale; + } + } + } + int flag_bias = (bias != nullptr) ? 1 : 0; + size_t mc = N * sizeof(int8_t); + size_t nc = M; + size_t output_stride = N * sizeof(int8_t); + size_t output_decrement = output_stride * nc - 48 * sizeof(int8_t); + size_t pair_num = M / 2; + size_t lave_num = M % 2; + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; + float vbias_scale[8] = {0.f}; + while + SPARSE_LIKELY(mc >= 48 * sizeof(int8_t)) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, int8_t) + float* bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + float* hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W48_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_1_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", + "q13", "q14", "q15", "r0", "r1", "cc", "memory"); + // clang-format on + bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2 + 1]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2 + 1]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W48_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", + "q13", "q14", "q15", "r0", "r1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_INT8_V7_PARAM_TABLE(int8_t, int8_t) + // clang-format off + asm volatile(SPARSE_INT8_INT8_W48_v7_SEMI2_OUT + 
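/* odd M: the leftover output channel runs the same 48-wide kernel with the single-row operand table (vdif = 1, one weight per non-zero) */ +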
SET_ASSM_INPUT_PARAM2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", + "q13", "q14", "q15", "r0", "r1", "cc", "memory"); + // clang-format on + } + output = + reinterpret_cast((uintptr_t)output + 48 * sizeof(int8_t)); + B += 48; + mc -= 48 * sizeof(int8_t); + } + if + SPARSE_UNLIKELY(mc != 0) { + output_decrement += 16 * sizeof(int8_t); + if (mc & (32 * sizeof(int8_t))) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, int8_t) + float* bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + float* hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W32_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_1_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", + "q13", "q14", "q15", "r0", "r1", "cc", "memory"); + // clang-format on + bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2 + 1]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2 + 1]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W32_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", + "q13", "q14", "q15", "r0", "r1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_INT8_V7_PARAM_TABLE(int8_t, int8_t) + // clang-format off + asm volatile(SPARSE_INT8_INT8_W32_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", + "q13", "q14", "q15", "r0", "r1", "cc", "memory"); + // clang-format on + } + output = + reinterpret_cast((uintptr_t)output + 32 * sizeof(int8_t)); + B += 32; + mc -= 32 * sizeof(int8_t); + } + output_decrement += 16 * sizeof(int8_t); + if (mc & (16 * sizeof(int8_t))) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, int8_t) + float* bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + float* hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W16_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_1_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", + "q13", "r0", "r1", "cc", "memory"); + // clang-format on + bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2 + 1]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2 + 1]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W16_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 
"q10", "q11", "q12", + "q13", "r0", "r1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_INT8_V7_PARAM_TABLE(int8_t, int8_t) + // clang-format off + asm volatile(SPARSE_INT8_INT8_W16_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", + "q13", "r0", "r1", "cc", "memory"); + // clang-format on + } + output = + reinterpret_cast((uintptr_t)output + 16 * sizeof(int8_t)); + B += 16; + mc -= 16 * sizeof(int8_t); + } + output_decrement += 8 * sizeof(int8_t); + if (mc & (8 * sizeof(int8_t))) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, int8_t) + float* bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + float* hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W8_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_1_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "r0", "r1", "cc", "memory"); + // clang-format on + bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2 + 1]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2 + 1]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W8_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "r0", "r1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_INT8_V7_PARAM_TABLE(int8_t, int8_t) + // clang-format off + asm volatile(SPARSE_INT8_INT8_W8_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "r0", "r1", "cc", "memory"); + // clang-format on + } + output = + reinterpret_cast((uintptr_t)output + 8 * sizeof(int8_t)); + B += 8; + mc -= 8 * sizeof(int8_t); + } + output_decrement += 4 * sizeof(int8_t); + if (mc & (4 * sizeof(int8_t))) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + GET_SEMI_INT8_V7_PARAM_TABLE(int8_t, int8_t) + float* bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + float* hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W4_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_1_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "r0", "r1", "cc", "memory"); + // clang-format on + bias_scale = vbias_scale; + if (bias) { + bias_scale[0] = bias[i * 2 + 1]; + bias_scale[1] = bias_scale[0]; + bias_scale[2] = bias_scale[0]; + bias_scale[3] = bias_scale[0]; + } + bias_scale[4] = scale[i * 2 + 1]; + bias_scale[5] = bias_scale[4]; + bias_scale[6] = bias_scale[4]; + bias_scale[7] = bias_scale[4]; + hs_param = vhs_param; + // clang-format off + asm volatile(SPARSE_INT8_INT8_W4_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM1_2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", 
"q4", "q5", "q6", "r0", "r1", "cc", "memory"); + // clang-format on + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + GET_UNSTRUCT_INT8_V7_PARAM_TABLE(int8_t, int8_t) + // clang-format off + asm volatile(SPARSE_INT8_INT8_W4_v7_SEMI2_OUT + SET_ASSM_INPUT_PARAM2_V7_INT8_INT8 + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "r0", "r1", "cc", "memory"); + // clang-format on + } + output = + reinterpret_cast((uintptr_t)output + 4 * sizeof(int8_t)); + B += 4; + mc -= 4 * sizeof(int8_t); + } + + if + SPARSE_UNLIKELY(mc != 0 && mc < 4 * sizeof(int8_t)) { + LITE_PARALLEL_COMMON_BEGIN(i, tid, pair_num, 0, 1) { + int8_t* out_ptr1 = reinterpret_cast((uintptr_t)output + + output_stride * 2 * i); + int8_t* out_ptr2 = reinterpret_cast( + (uintptr_t)output + output_stride * (2 * i + 1)); + const int8_t* cur_w = A; + uint32_t nnz = nidx_nnzmap[i]; + const int8_t* cur_b = B; + const int32_t* dmap = widx_dmap; + if (i != 0) { + cur_w = A + nidx_nnzmap[i - 1] * 2; + nnz = nidx_nnzmap[i] - nidx_nnzmap[i - 1]; + cur_b += + ((nidx_nnzmap[i - 1] == 0) + ? 0 + : widx_dmap[nidx_nnzmap[i - 1] - 1] / sizeof(int8_t)); + dmap = widx_dmap + nidx_nnzmap[i - 1]; + } + + float32x4_t vbias1, vbias2; + if (bias != nullptr) { + vbias1 = vdupq_n_f32(bias[i * 2]); + vbias2 = vdupq_n_f32(bias[i * 2 + 1]); + } else { + vbias1 = vdupq_n_f32(0); + vbias2 = vdupq_n_f32(0); + } + const float* pscale = scale + i * 2; + int32x4_t vacc0123_1 = vdupq_n_s32(0); + int32x4_t vacc0123_2 = vdupq_n_s32(0); + for (int j = 0; j < nnz; j++) { + int8x8_t vi0123 = vdup_n_s8(0); + if (mc == 1) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + } else if (mc == 2) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + } else { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + vi0123 = vld1_lane_s8(cur_b + 2, vi0123, 2); + } + int8x8_t vw1 = vld1_dup_s8(cur_w); + cur_w += 1; + int8x8_t vw2 = vld1_dup_s8(cur_w); + cur_w += 1; + intptr_t diff = *dmap++; + cur_b = (const int8_t*)((uintptr_t)cur_b + (uintptr_t)diff); + vacc0123_1 = + vaddw_s16(vacc0123_1, vget_low_s16(vmull_s8(vi0123, vw1))); + vacc0123_2 = + vaddw_s16(vacc0123_2, vget_low_s16(vmull_s8(vi0123, vw2))); + } + float32x4_t vaccf0123_1 = vcvtq_f32_s32(vacc0123_1); + float32x4_t vaccf0123_2 = vcvtq_f32_s32(vacc0123_2); + vaccf0123_1 = vmlaq_n_f32(vbias1, vaccf0123_1, pscale[0]); + vaccf0123_2 = vmlaq_n_f32(vbias2, vaccf0123_2, pscale[1]); + float32x4_t vzero = vdupq_n_f32(0); + if (flag_act == 1) { + vaccf0123_1 = vmaxq_f32(vaccf0123_1, vzero); + vaccf0123_2 = vmaxq_f32(vaccf0123_2, vzero); + } else if (flag_act == 0) { + } else if (flag_act == 2) { + float32x4_t aph = vdupq_n_f32(alpha); + vaccf0123_1 = vmaxq_f32(vaccf0123_1, vzero); + vaccf0123_1 = vminq_f32(vaccf0123_1, aph); + vaccf0123_2 = vmaxq_f32(vaccf0123_2, vzero); + vaccf0123_2 = vminq_f32(vaccf0123_2, aph); + } else if (flag_act == 3) { + float32x4_t aph = vdupq_n_f32(alpha); + uint32x4_t vflag0123_1 = vcgeq_f32(vaccf0123_1, vzero); + float32x4_t v0123_1 = vmulq_f32(vaccf0123_1, aph); + vaccf0123_1 = vbslq_f32(vflag0123_1, vaccf0123_1, v0123_1); + uint32x4_t vflag0123_2 = vcgeq_f32(vaccf0123_2, vzero); + float32x4_t v0123_2 = vmulq_f32(vaccf0123_2, aph); + vaccf0123_2 = vbslq_f32(vflag0123_2, vaccf0123_2, v0123_2); + } else { + float32x4_t offset = + vdupq_n_f32(act_param.hard_swish_offset / param.output_scale); + float32x4_t hs_scale = + vdupq_n_f32(1.0 / act_param.hard_swish_scale); + float32x4_t thre = 
vdupq_n_f32(act_param.hard_swish_threshold / + param.output_scale); + float32x4_t vset0123_1 = vaddq_f32(vaccf0123_1, offset); + float32x4_t vscale0123_1 = vmulq_f32(vaccf0123_1, hs_scale); + vset0123_1 = vmaxq_f32(vset0123_1, vzero); + vset0123_1 = vminq_f32(vset0123_1, thre); + vaccf0123_1 = vmulq_f32(vscale0123_1, vset0123_1); + float32x4_t vset0123_2 = vaddq_f32(vaccf0123_2, offset); + float32x4_t vscale0123_2 = vmulq_f32(vaccf0123_2, hs_scale); + vset0123_2 = vmaxq_f32(vset0123_2, vzero); + vset0123_2 = vminq_f32(vset0123_2, thre); + vaccf0123_2 = vmulq_f32(vscale0123_2, vset0123_2); + } + vaccf0123_1 = vbslq_f32(vcgeq_f32(vaccf0123_1, vdupq_n_f32(-127.0)), + vaccf0123_1, + vdupq_n_f32(-127.0)); + vaccf0123_2 = vbslq_f32(vcgeq_f32(vaccf0123_2, vdupq_n_f32(-127.0)), + vaccf0123_2, + vdupq_n_f32(-127.0)); + float32x4_t vpos = vdupq_n_f32(0.5); + float32x4_t vneg = vdupq_n_f32(-0.5); + vaccf0123_1 = vbslq_f32(vcgeq_f32(vaccf0123_1, vzero), + vaddq_f32(vaccf0123_1, vpos), + vaddq_f32(vaccf0123_1, vneg)); + vaccf0123_2 = vbslq_f32(vcgeq_f32(vaccf0123_2, vzero), + vaddq_f32(vaccf0123_2, vpos), + vaddq_f32(vaccf0123_2, vneg)); + + int32x4_t vacci0123_1 = vcvtq_s32_f32(vaccf0123_1); + int16x4_t v16i0123_1 = vqmovn_s32(vacci0123_1); + int16x4_t v16i4567_1 = vdup_n_s16(0); + int8x8_t v8i01234567_1 = + vqmovn_s16(vcombine_s16(v16i0123_1, v16i4567_1)); + + int32x4_t vacci0123_2 = vcvtq_s32_f32(vaccf0123_2); + int16x4_t v16i0123_2 = vqmovn_s32(vacci0123_2); + int16x4_t v16i4567_2 = vdup_n_s16(0); + int8x8_t v8i01234567_2 = + vqmovn_s16(vcombine_s16(v16i0123_2, v16i4567_2)); + + if (mc == 1) { + vst1_lane_s8(out_ptr1, v8i01234567_1, 0); + vst1_lane_s8(out_ptr2, v8i01234567_2, 0); + } else if (mc == 2) { + vst1_lane_s8(out_ptr1, v8i01234567_1, 0); + vst1_lane_s8(out_ptr2, v8i01234567_2, 0); + vst1_lane_s8(out_ptr1 + 1, v8i01234567_1, 1); + vst1_lane_s8(out_ptr2 + 1, v8i01234567_2, 1); + } else { + vst1_lane_s8(out_ptr1, v8i01234567_1, 0); + vst1_lane_s8(out_ptr2, v8i01234567_2, 0); + vst1_lane_s8(out_ptr1 + 1, v8i01234567_1, 1); + vst1_lane_s8(out_ptr2 + 1, v8i01234567_2, 1); + vst1_lane_s8(out_ptr1 + 2, v8i01234567_1, 2); + vst1_lane_s8(out_ptr2 + 2, v8i01234567_2, 2); + } + } + LITE_PARALLEL_COMMON_END(); + if + SPARSE_UNLIKELY(lave_num != 0) { + int8_t* out_ptr = reinterpret_cast( + (uintptr_t)output + output_stride * 2 * pair_num); + const int8_t* cur_w = A; + uint32_t nnz = nidx_nnzmap[pair_num]; + const int8_t* cur_b = B; + const int32_t* dmap = widx_dmap; + if (pair_num != 0) { + cur_w = A + nidx_nnzmap[pair_num - 1] * 2; + nnz = nidx_nnzmap[pair_num] - nidx_nnzmap[pair_num - 1]; + cur_b += ((nidx_nnzmap[pair_num - 1] == 0) + ? 0 + : widx_dmap[nidx_nnzmap[pair_num - 1] - 1] / + sizeof(int8_t)); + dmap = widx_dmap + nidx_nnzmap[pair_num - 1]; + } + + float32x4_t vbias = (bias != nullptr) + ? 
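/* the odd tail row is output row 2 * pair_num, hence the pair_num * 2 index */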
vdupq_n_f32(bias[pair_num * 2]) + : vdupq_n_f32(0); + float vscale = scale[pair_num * 2]; + int32x4_t vacc0123 = vdupq_n_s32(0); + + for (int j = 0; j < nnz; j++) { + int8x8_t vi0123 = vdup_n_s8(0); + if (mc == 1) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + } else if (mc == 2) { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + } else { + vi0123 = vld1_lane_s8(cur_b, vi0123, 0); + vi0123 = vld1_lane_s8(cur_b + 1, vi0123, 1); + vi0123 = vld1_lane_s8(cur_b + 2, vi0123, 2); + } + int8x8_t vw = vld1_dup_s8(cur_w); + cur_w += 1; + intptr_t diff = *dmap++; + cur_b = (const int8_t*)((uintptr_t)cur_b + (uintptr_t)diff); + int16x8_t vo0123 = vmull_s8(vi0123, vw); + vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vo0123)); + } + float32x4_t vaccf0123 = vcvtq_f32_s32(vacc0123); + vaccf0123 = vmlaq_n_f32(vbias, vaccf0123, vscale); + float32x4_t vzero = vdupq_n_f32(0); + if (flag_act == 1) { + vaccf0123 = vmaxq_f32(vaccf0123, vzero); + } else if (flag_act == 0) { + } else if (flag_act == 2) { + float32x4_t aph = vdupq_n_f32(alpha); + vaccf0123 = vmaxq_f32(vaccf0123, vzero); + vaccf0123 = vminq_f32(vaccf0123, aph); + } else if (flag_act == 3) { + float32x4_t aph = vdupq_n_f32(alpha); + uint32x4_t vflag0123 = vcgeq_f32(vaccf0123, vzero); + float32x4_t v0123 = vmulq_f32(vaccf0123, aph); + vaccf0123 = vbslq_f32(vflag0123, vaccf0123, v0123); + } else { + float32x4_t offset = vdupq_n_f32(act_param.hard_swish_offset / + param.output_scale); + float32x4_t hs_scale = + vdupq_n_f32(1.0 / act_param.hard_swish_scale); + float32x4_t thre = vdupq_n_f32(act_param.hard_swish_threshold / + param.output_scale); + float32x4_t vset0123 = vaddq_f32(vaccf0123, offset); + float32x4_t vscale0123 = vmulq_f32(vaccf0123, hs_scale); + vset0123 = vmaxq_f32(vset0123, vzero); + vset0123 = vminq_f32(vset0123, thre); + vaccf0123 = vmulq_f32(vscale0123, vset0123); + } + vaccf0123 = vbslq_f32(vcgeq_f32(vaccf0123, vdupq_n_f32(-127.0)), + vaccf0123, + vdupq_n_f32(-127.0)); + float32x4_t vpos = vdupq_n_f32(0.5); + float32x4_t vneg = vdupq_n_f32(-0.5); + vaccf0123 = vbslq_f32(vcgeq_f32(vaccf0123, vzero), + vaddq_f32(vaccf0123, vpos), + vaddq_f32(vaccf0123, vneg)); + + int32x4_t vacci0123 = vcvtq_s32_f32(vaccf0123); + + int16x4_t v16i0123 = vqmovn_s32(vacci0123); + int16x4_t v16i4567 = vdup_n_s16(0); + int8x8_t v8i01234567 = + vqmovn_s16(vcombine_s16(v16i0123, v16i4567)); + + if (mc == 1) { + vst1_lane_s8(out_ptr, v8i01234567, 0); + } else if (mc == 2) { + vst1_lane_s8(out_ptr, v8i01234567, 0); + vst1_lane_s8(out_ptr + 1, v8i01234567, 1); + } else { + vst1_lane_s8(out_ptr, v8i01234567, 0); + vst1_lane_s8(out_ptr + 1, v8i01234567, 1); + vst1_lane_s8(out_ptr + 2, v8i01234567, 2); + } + } + } + } +} + +#undef INIT_SEMI_CONV_PARAM +#undef GET_SEMI_PARAM_TABLE +#undef GET_UNSTRUCT_PARAM_TABLE +#undef SET_ASSM_INPUT_PARAM1_V7_F32 +#undef SET_ASSM_INPUT_PARAM2_V7_F32 +#undef COMPUTE_ACT_NEON_TWO_V7_F32 +#undef COMPUTE_ACT_NEON_ONE_V7_F32 +#undef GET_SEMI_INT8_V7_PARAM_TABLE +#undef SET_ASSM_INPUT_PARAM1_1_V7_INT8_F32 +#undef SET_ASSM_INPUT_PARAM1_2_V7_INT8_F32 +#undef SET_ASSM_INPUT_PARAM2_V7_INT8_F32 +#undef GET_UNSTRUCT_INT8_V7_PARAM_TABLE +#undef SET_ASSM_INPUT_PARAM1_1_V7_INT8_INT8 +#undef SET_ASSM_INPUT_PARAM1_2_V7_INT8_INT8 +#undef SET_ASSM_INPUT_PARAM2_V7_INT8_INT8 + +#endif + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/sparse_semi_conv_impl.h b/lite/backends/arm/math/sparse_semi_conv_impl.h new file mode 100644 index 
00000000000..859477b01f4 --- /dev/null +++ b/lite/backends/arm/math/sparse_semi_conv_impl.h @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/backends/arm/math/sparse_conv_impl.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void sparse_semi_conv_fp32_pipelined(const float* A, + const float* B, + const int32_t* widx_dmap, + const uint32_t* nidx_nnzmap, + const float* bias, + float* output, + const int M, + const int K, + const int N, + const operators::SparseConvParam& param, + ARMContext* ctx); + +void sparse_semi_conv_int8_fp32_pipelined( + const int8_t* A, + const int8_t* B, + const int32_t* widx_dmap, + const uint32_t* nidx_nnzmap, + const float* bias, + const float* scale, + float* output, + int M, + int K, + int N, + const operators::SparseConvParam& param, + ARMContext* ctx); + +void sparse_semi_conv_int8_int8_pipelined( + const int8_t* A, + const int8_t* B, + const int32_t* widx_dmap, + const uint32_t* nidx_nnzmap, + const float* bias, + const float* scale, + int8_t* output, + int M, + int K, + int N, + const operators::SparseConvParam& param, + ARMContext* ctx); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/core/optimizer/mir/sparse_conv_detect_pass.cc b/lite/core/optimizer/mir/sparse_conv_detect_pass.cc index 273044bf8d4..4c9f7cb08e6 100644 --- a/lite/core/optimizer/mir/sparse_conv_detect_pass.cc +++ b/lite/core/optimizer/mir/sparse_conv_detect_pass.cc @@ -192,6 +192,180 @@ int SparseConvDetectPass::ComputeSparseWeight( return first_ic; } +template +int SparseConvDetectPass::ComputeSemiSparseWeight( + const lite::Tensor* w_tensor, + const int M, + const int K, + const int N, + const int count_nonzeroes, + const int count_channels, + const int count_blocks, + lite::Tensor* nonzero_output_tensor, + lite::Tensor* oc_nonzeros_tensor, + lite::Tensor* diffs_tensor) { + const T* weights = w_tensor->data(); + T* nonzero_output = nonzero_output_tensor->mutable_data(); + auto* oc_nonzeros = oc_nonzeros_tensor->mutable_data(); + auto* diffs = diffs_tensor->mutable_data(); + int align2 = M & (-2); + size_t output_channels_block_size = 2; + size_t first_ic = 0, last_ic = 0; + bool first_nonzero = true; + int nonzero_index = 0, diff_index = 0; + size_t block_index = 0, block_n = 0; + for (size_t ocb = 0; ocb < align2; ocb += output_channels_block_size) { + for (size_t ic = 0; ic < K; ic++) { + bool is_nonzero_block = false; + for (size_t oco = 0; oco < output_channels_block_size; oco++) { + is_nonzero_block |= + (weights[(ocb + oco) * K + ic] != static_cast(0)); + } + if (is_nonzero_block) { + for (size_t oco = 0; oco < output_channels_block_size; oco++) { + nonzero_output[nonzero_index++] = weights[(ocb + oco) * K + ic]; + } + if 
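/* the first non-zero input channel overall is exported as first_ic; the runtime offsets the input by first_ic * im_size before invoking the kernel */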
(first_nonzero) { + first_ic = ic; + } else { + const int diff = (ic - last_ic) * sizeof(T); + diffs[diff_index++] = diff * N; + } + first_nonzero = false; + last_ic = ic; + oc_nonzeros[block_index] += 1; + block_n++; + } + } + oc_nonzeros[block_index++] = block_n; + } + for (size_t ocb = align2; ocb < M; ocb++) { + for (size_t ic = 0; ic < K; ic++) { + if (weights[ocb * K + ic] != static_cast(0)) { + nonzero_output[nonzero_index++] = weights[ocb * K + ic]; + if (first_nonzero) { + first_ic = ic; + } else { + const int diff = (ic - last_ic) * sizeof(T); + diffs[diff_index++] = diff * N; + } + first_nonzero = false; + last_ic = ic; + oc_nonzeros[block_index] += 1; + block_n++; + } + } + oc_nonzeros[block_index++] = block_n; + } + int tmp_diff = 0; + int tmp_ik = 0; + size_t block_i = 0; + for (size_t ocb = 0; ocb < align2; ocb += output_channels_block_size) { + if (block_i == 0) { + for (int ik = 0; ik < oc_nonzeros[block_i]; ik++) { + tmp_diff += diffs[tmp_ik++]; + } + } else { + for (int ik = 0; ik < (oc_nonzeros[block_i] - oc_nonzeros[block_i - 1]); + ik++) { + tmp_diff += diffs[tmp_ik++]; + } + } + if (tmp_ik != 0) { + diffs[tmp_ik - 1] = tmp_diff; + } + block_i++; + } + for (size_t ocb = align2; ocb < M; ocb++) { + if (block_i == 0) { + for (int ik = 0; ik < oc_nonzeros[block_i]; ik++) { + tmp_diff += diffs[tmp_ik++]; + } + } else { + for (int ik = 0; ik < (oc_nonzeros[block_i] - oc_nonzeros[block_i - 1]); + ik++) { + tmp_diff += diffs[tmp_ik++]; + } + } + if (tmp_ik != 0) { + diffs[tmp_ik - 1] = tmp_diff; + } + block_i++; + } + if (!first_nonzero) { + const int diff = (first_ic - last_ic) * sizeof(T); + diffs[diff_index++] = diff * N; + } + return first_ic; +} + +template +int SparseConvDetectPass::ComputeSemiSparseZeros(const lite::Tensor* weights, + int* count_nonzeroes, + int* count_channels, + int* count_blocks, + int* flag_semi, + const int height, + const int width) { + const T* data = weights->data(); + int num_nonzeroes = 0; + int num_nonzero_blocks2 = 0; + int num_nonzero_blocks4 = 0; + int align4 = height & (-4); + int align2 = height & (-2); + for (size_t oc = 0; oc < align4; oc += 4) { + for (size_t ic = 0; ic < width; ic++) { + const size_t row0_nonzero = + static_cast(data[oc * width + ic] != static_cast(0)); + const size_t row1_nonzero = + static_cast(data[(oc + 1) * width + ic] != static_cast(0)); + const size_t row2_nonzero = + static_cast(data[(oc + 2) * width + ic] != static_cast(0)); + const size_t row3_nonzero = + static_cast(data[(oc + 3) * width + ic] != static_cast(0)); + num_nonzeroes += + row0_nonzero + row1_nonzero + row2_nonzero + row3_nonzero; + num_nonzero_blocks2 += + (row0_nonzero | row1_nonzero) + (row2_nonzero | row3_nonzero); + num_nonzero_blocks4 += + (row0_nonzero | row1_nonzero | row2_nonzero | row3_nonzero); + } + } + for (size_t oc = align4; oc < align2; oc += 2) { + for (size_t ic = 0; ic < width; ic++) { + const size_t row0_nonzero = + static_cast(data[oc * width + ic] != static_cast(0)); + const size_t row1_nonzero = + static_cast(data[(oc + 1) * width + ic] != static_cast(0)); + num_nonzeroes += row0_nonzero + row1_nonzero; + num_nonzero_blocks2 += (row0_nonzero | row1_nonzero); + } + } + const size_t num_block2_nonzeroes = num_nonzeroes; + for (size_t oc = align2; oc < height; oc++) { + for (size_t ic = 0; ic < width; ic++) { + num_nonzeroes += + static_cast(data[oc * width + ic] != static_cast(0)); + } + } + *flag_semi = 0; + *count_channels = height; + *count_nonzeroes = num_nonzeroes; + *count_blocks = num_nonzeroes; + if 
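/* use the semi layout when non-zero 2-channel blocks are at least 90% occupied: nnz2 / (2 * blocks2) >= 0.9, i.e. 5 * nnz2 >= 9 * blocks2 */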
(num_block2_nonzeroes * 5 >= num_nonzero_blocks2 * 9) { + // 2-channel blocks have 90%+ non-zeroes + *count_channels = (*count_channels) / 2 + (*count_channels) % 2; + // spmm_parameters = &xnn_params.f32.spmm2; + *flag_semi = 1; + // Non-zeroes which don't fit into whole 2-channel blocks, processed + // one-by-one + const size_t num_remaining_nonzeroes = num_nonzeroes - num_block2_nonzeroes; + *count_nonzeroes = num_nonzero_blocks2 * 2 + num_remaining_nonzeroes; + *count_blocks = num_nonzero_blocks2 + num_remaining_nonzeroes; + } + return height * width - (*count_nonzeroes); +} + /** * \brief Sparse representation of weights consists of three components: * @param w_tensor original dense weight data. @@ -262,6 +436,46 @@ template int SparseConvDetectPass::ComputeSparseWeight( lite::Tensor* oc_nonzeros_tensor, lite::Tensor* diffs_tensor); +/** + * \brief Semi-structured representation of weights consists of three + * components: + * @param w_tensor original dense weight data. + * @param count_nonzeroes the number of non-zero kernel elements. + * @param count_channels When semi-structured, it is equal to output channel / 2 + * + output channel% 2 + * @param count_blocks When semi-structured, the number of 2-channel blocks. + * @param nonzero_output_tensor An array of float values storing non-zero kernel + * elements. + * @param oc_nonzeros_tensor the number of non-zero kernel elements per each + * output channel. + * @param diffs_tensor An array of int32_t values storing scaled [by + * sizeof(input element)] difference + * between input channels corresponding to successive non-zero element. + */ +template int SparseConvDetectPass::ComputeSemiSparseWeight( + const lite::Tensor* w_tensor, + const int M, + const int K, + const int N, + const int count_nonzeroes, + const int count_channels, + const int count_blocks, + lite::Tensor* nonzero_output_tensor, + lite::Tensor* oc_nonzeros_tensor, + lite::Tensor* diffs_tensor); + +template int SparseConvDetectPass::ComputeSemiSparseWeight( + const lite::Tensor* w_tensor, + const int M, + const int K, + const int N, + const int count_nonzeroes, + const int count_channels, + const int count_blocks, + lite::Tensor* nonzero_output_tensor, + lite::Tensor* oc_nonzeros_tensor, + lite::Tensor* diffs_tensor); + template int SparseConvDetectPass::ComputeSparseZeros(const lite::Tensor* weights, const int num) { @@ -317,6 +531,24 @@ template int SparseConvDetectPass::ComputeSparseZeros( const int height, const int width); +template int SparseConvDetectPass::ComputeSemiSparseZeros( + const lite::Tensor* weights, + int* count_nonzeroes, + int* count_channels, + int* count_blocks, + int* flag_semi, + const int height, + const int width); + +template int SparseConvDetectPass::ComputeSemiSparseZeros( + const lite::Tensor* weights, + int* count_nonzeroes, + int* count_channels, + int* count_blocks, + int* flag_semi, + const int height, + const int width); + void SparseConvDetectPass::CopyAttrFromOpInfo(cpp::OpDesc* op_desc, OpInfo* op_info, const std::string& attr_name) { @@ -413,13 +645,36 @@ void SparseConvDetectPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "The paddings of the supported sparse conv must be 0"; continue; } + if (!(ch_out > 0 && ch_in > 0)) { + VLOG(4) << "The input and output channels must be larger than 0"; + continue; + } int zero_num; int num_build_nonzeroes = 0; + int count_nonzeroes = 0; + int count_channels = 0; + int count_blocks = 0; + int flag_semi = 0; if (use_fp32) { - zero_num = ComputeSparseZeros( - &w_tensor, 
&num_build_nonzeroes, ch_out, ch_in); + zero_num = ComputeSemiSparseZeros(&w_tensor, + &count_nonzeroes, + &count_channels, + &count_blocks, + &flag_semi, + ch_out, + ch_in); + if (flag_semi == 0) { + zero_num = ComputeSparseZeros( + &w_tensor, &num_build_nonzeroes, ch_out, ch_in); + } } else if (use_int8) { - zero_num = ComputeSparseZeros(&w_tensor, weight_num); + zero_num = ComputeSemiSparseZeros(&w_tensor, + &count_nonzeroes, + &count_channels, + &count_blocks, + &flag_semi, + ch_out, + ch_in); } int nonzero_num = weight_num - zero_num; VLOG(4) << "zero_num: " << zero_num << "weight_num: " << weight_num; @@ -451,34 +706,72 @@ void SparseConvDetectPass::Apply(const std::unique_ptr& graph) { auto* oc_nonzeros_t = scope->Var(oc_nonzeros_name)->GetMutable(); auto* ic_diffs_t = scope->Var(ic_diffs_name)->GetMutable(); if (use_fp32) { - nonzeros_output_t->Resize({num_build_nonzeroes}); - oc_nonzeros_t->Resize({ch_out}); - ic_diffs_t->Resize({num_build_nonzeroes}); + if (flag_semi == 1) { + nonzeros_output_t->Resize({count_nonzeroes}); + oc_nonzeros_t->Resize({ch_out}); + ic_diffs_t->Resize({count_blocks}); + } else { + nonzeros_output_t->Resize({num_build_nonzeroes}); + oc_nonzeros_t->Resize({ch_out}); + ic_diffs_t->Resize({num_build_nonzeroes}); + } } else if (use_int8) { - nonzeros_output_t->Resize({nonzero_num}); - oc_nonzeros_t->Resize({ch_out}); - ic_diffs_t->Resize({nonzero_num}); + if (flag_semi == 1) { + nonzeros_output_t->Resize({count_nonzeroes}); + oc_nonzeros_t->Resize({ch_out}); + ic_diffs_t->Resize({count_blocks}); + } else { + nonzeros_output_t->Resize({count_nonzeroes}); + oc_nonzeros_t->Resize({ch_out}); + ic_diffs_t->Resize({count_nonzeroes}); + } } int first_ic; if (use_fp32) { - first_ic = ComputeSparseWeight(&w_tensor, - ch_out, - ch_in, - im_size, - nonzero_num, - num_build_nonzeroes, - nonzeros_output_t, - oc_nonzeros_t, - ic_diffs_t); + if (flag_semi == 1) { + first_ic = ComputeSemiSparseWeight(&w_tensor, + ch_out, + ch_in, + im_size, + count_nonzeroes, + count_channels, + count_blocks, + nonzeros_output_t, + oc_nonzeros_t, + ic_diffs_t); + } else { + first_ic = ComputeSparseWeight(&w_tensor, + ch_out, + ch_in, + im_size, + nonzero_num, + num_build_nonzeroes, + nonzeros_output_t, + oc_nonzeros_t, + ic_diffs_t); + } } else if (use_int8) { - first_ic = ComputeSparseWeight(&w_tensor, - ch_out, - ch_in, - im_size, - nonzero_num, - nonzeros_output_t, - oc_nonzeros_t, - ic_diffs_t); + if (flag_semi == 1) { + first_ic = ComputeSemiSparseWeight(&w_tensor, + ch_out, + ch_in, + im_size, + count_nonzeroes, + count_channels, + count_blocks, + nonzeros_output_t, + oc_nonzeros_t, + ic_diffs_t); + } else { + first_ic = ComputeSparseWeight(&w_tensor, + ch_out, + ch_in, + im_size, + nonzero_num, + nonzeros_output_t, + oc_nonzeros_t, + ic_diffs_t); + } } VLOG(4) << "zero_num: " << zero_num << " weight_num: " << weight_num << " first_ic: " << first_ic; @@ -526,6 +819,7 @@ void SparseConvDetectPass::Apply(const std::unique_ptr& graph) { } op_desc.SetAttr("first_ic", first_ic); + op_desc.SetAttr("flag_semi", flag_semi); sparse_conv2d_op->Attach(op_desc, node->stmt()->op()->scope()); auto* sparse_op_node = graph->GraphCreateInstructNode( sparse_conv2d_op, graph->valid_places()); diff --git a/lite/core/optimizer/mir/sparse_conv_detect_pass.h b/lite/core/optimizer/mir/sparse_conv_detect_pass.h index 3b451653787..2288e604afa 100644 --- a/lite/core/optimizer/mir/sparse_conv_detect_pass.h +++ b/lite/core/optimizer/mir/sparse_conv_detect_pass.h @@ -36,6 +36,15 @@ class SparseConvDetectPass : 
public ProgramPass { const int height, const int width); + template + int ComputeSemiSparseZeros(const lite::Tensor* weights, + int* count_nonzeroes, + int* count_channels, + int* count_blocks, + int* flag_semi, + const int height, + const int width); + template int ComputeSparseWeight(const lite::Tensor* w_tensor, const int M, @@ -45,6 +54,19 @@ class SparseConvDetectPass : public ProgramPass { lite::Tensor* nonzero_output_tensor, lite::Tensor* oc_nonzeros_tensor, lite::Tensor* diffs_tensor); + + template + int ComputeSemiSparseWeight(const lite::Tensor* w_tensor, + const int M, + const int K, + const int N, + const int count_nonzeroes, + const int count_channels, + const int count_blocks, + lite::Tensor* nonzero_output_tensor, + lite::Tensor* oc_nonzeros_tensor, + lite::Tensor* diffs_tensor); + template int ComputeSparseWeight(const lite::Tensor* w_tensor, const int M, diff --git a/lite/kernels/arm/sparse_conv_compute.cc b/lite/kernels/arm/sparse_conv_compute.cc index f3c0a34c517..75f1533b98e 100644 --- a/lite/kernels/arm/sparse_conv_compute.cc +++ b/lite/kernels/arm/sparse_conv_compute.cc @@ -15,6 +15,7 @@ #include "lite/kernels/arm/sparse_conv_compute.h" #include #include "lite/backends/arm/math/sparse_conv_impl.h" +#include "lite/backends/arm/math/sparse_semi_conv_impl.h" #include "lite/core/op_registry.h" #include "lite/core/type_system.h" @@ -108,18 +109,33 @@ void SparseConvCompute::Run() { int oc = o_dims[1]; int im_size = oh * ow; int first_ic = param.first_ic; + int flag_semi = param.flag_semi; const float* din = input + first_ic * im_size; - lite::arm::math::sparse_conv_fp32_pipelined(nonzero_weights, - din, - diffs, - oc_nonzeros, - bias, - dout, - oc, - ic, - im_size, - param, - &ctx); + if (flag_semi == 1) { + lite::arm::math::sparse_semi_conv_fp32_pipelined(nonzero_weights, + din, + diffs, + oc_nonzeros, + bias, + dout, + oc, + ic, + im_size, + param, + &ctx); + } else { + lite::arm::math::sparse_conv_fp32_pipelined(nonzero_weights, + din, + diffs, + oc_nonzeros, + bias, + dout, + oc, + ic, + im_size, + param, + &ctx); + } KERNEL_FUNC_NAME("sparse_conv_fp32_pipelined") } @@ -148,19 +164,35 @@ void SparseConvCompute::Run() { int oc = o_dims[1]; int im_size = oh * ow; int first_ic = param.first_ic; + int flag_semi = param.flag_semi; auto* din = input + first_ic * im_size; - lite::arm::math::sparse_conv_int8_fp32_pipelined(nonzero_weights, - din, - diffs, - oc_nonzeros, - bias, - w_scale_.data(), - dout, - oc, - ic, - im_size, - param, - &ctx); + if (flag_semi == 1) { + lite::arm::math::sparse_semi_conv_int8_fp32_pipelined(nonzero_weights, + din, + diffs, + oc_nonzeros, + bias, + w_scale_.data(), + dout, + oc, + ic, + im_size, + param, + &ctx); + } else { + lite::arm::math::sparse_conv_int8_fp32_pipelined(nonzero_weights, + din, + diffs, + oc_nonzeros, + bias, + w_scale_.data(), + dout, + oc, + ic, + im_size, + param, + &ctx); + } KERNEL_FUNC_NAME("sparse_conv_int8_fp32_pipelined") } @@ -189,19 +221,35 @@ void SparseConvCompute::Run() { int oc = o_dims[1]; int im_size = oh * ow; int first_ic = param.first_ic; + int flag_semi = param.flag_semi; auto* din = input + first_ic * im_size; - lite::arm::math::sparse_conv_int8_int8_pipelined(nonzero_weights, - din, - diffs, - oc_nonzeros, - bias, - w_scale_.data(), - dout, - oc, - ic, - im_size, - param, - &ctx); + if (flag_semi == 1) { + lite::arm::math::sparse_semi_conv_int8_int8_pipelined(nonzero_weights, + din, + diffs, + oc_nonzeros, + bias, + w_scale_.data(), + dout, + oc, + ic, + im_size, + param, + &ctx); + } else { + 
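/* flag_semi == 0: the 2-channel density check failed, fall back to the fully unstructured int8 kernel */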
lite::arm::math::sparse_conv_int8_int8_pipelined(nonzero_weights, + din, + diffs, + oc_nonzeros, + bias, + w_scale_.data(), + dout, + oc, + ic, + im_size, + param, + &ctx); + } KERNEL_FUNC_NAME("sparse_conv_int8_int8_pipelined") } diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 4a130a2d316..a3a421e45e8 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -336,6 +336,7 @@ struct SparseConvParam : ParamBase { const lite::Tensor* bias{nullptr}; lite::Tensor* output{}; int first_ic{0}; + int flag_semi{0}; std::vector strides{1, 1}; std::shared_ptr> paddings; int groups{1}; diff --git a/lite/operators/sparse_conv_op.h b/lite/operators/sparse_conv_op.h index 0c31510479c..6c8aa90277f 100644 --- a/lite/operators/sparse_conv_op.h +++ b/lite/operators/sparse_conv_op.h @@ -120,6 +120,9 @@ class SparseConvOp : public OpLite { if (op_desc.HasAttr("first_ic")) { param_.first_ic = op_desc.GetAttr("first_ic"); } + if (op_desc.HasAttr("flag_semi")) { + param_.flag_semi = op_desc.GetAttr("flag_semi"); + } // For Int8 const OpInfo* op_info = static_cast(&op_desc);