From 00e2d64a9dce99e8eab8e859fb9b8e5eb8a51bec Mon Sep 17 00:00:00 2001 From: Zhang Xiangze Date: Wed, 17 Sep 2025 02:05:17 +0000 Subject: [PATCH 1/3] [CPU]Improve dynamic 4bit moe performance - Avoid tensor concat - Use silu_and_mul kernel Signed-off-by: Zhang Xiangze --- csrc/moe/dynamic_4bit_int_moe_cpu.cpp | 43 +++++++++++---------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp index 1d06fc6b5b0a..bdc893bb93be 100644 --- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp +++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp @@ -21,6 +21,8 @@ inline torch::Tensor mm(const torch::Tensor& a, const torch::Tensor& packed_w, #endif } +extern void silu_and_mul(torch::Tensor& out, torch::Tensor& input); + enum ActivationKind : int64_t { SwiGLU_Gu = 0, // act = SiLU(g) * u SwiGLUOAI = 1, // act = SiLU(u) * g @@ -87,30 +89,23 @@ torch::Tensor dynamic_4bit_int_moe_cpu( const int64_t g_eff_13 = (group_size != -1) ? group_size : H; const int64_t g_eff_2 = (group_size != -1) ? group_size : I; - // Per-expert outputs filled in parallel - std::vector y_list(E); - y_list.resize(E); + auto X_all = x_c.index_select(/*dim=*/0, expert_tokens); + if (apply_router_weight_on_input) { + X_all = X_all.mul(expert_gates.unsqueeze(1)); + } + auto Y_all = at::empty({offsets[E], H}, x_c.options()); at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) { + c10::InferenceMode guard; for (int64_t e = e_begin; e < e_end; ++e) { const int64_t te = counts[e]; if (te == 0) { - y_list[e] = at::empty({0, H}, x_c.options()); continue; } const int64_t start = offsets[e]; - auto sel_tokens = - expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); - auto gates_e = - expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); - - auto x_e = x_c.index_select(/*dim=*/0, sel_tokens); - - if (apply_router_weight_on_input) { - x_e = x_e.mul(gates_e.unsqueeze(1)); - } + auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); auto w13_e = w13_packed.select(/*dim=*/0, e); auto w2_e = w2_packed.select(/*dim=*/0, e); @@ -119,11 +114,10 @@ torch::Tensor dynamic_4bit_int_moe_cpu( auto y13 = mm(x_e, w13_e, g_eff_13, /*in_features=*/H, /*out_features=*/I2); - auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I); - auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I); - torch::Tensor act; if (activation_kind == ActivationKind::SwiGLUOAI) { // SwiGLUOAI + auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I); + auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I); constexpr double kAlpha = 1.702; // GPT-OSS default constexpr double kLimit = 7.0; // GPT-OSS default auto gate_c = at::clamp_max(g_part, kLimit); @@ -131,23 +125,22 @@ torch::Tensor dynamic_4bit_int_moe_cpu( auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha))); act = up_c.add(1.0).mul(glu); } else { // SiLU , SwiGLU_GU, vLLM maps silu to SiluAndMul() - act = at::silu(g_part).mul(u_part); + act = at::empty({te, I}, y13.options()); + silu_and_mul(act, y13); } // W2 auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H); - if (!apply_router_weight_on_input) { - y = y.mul(gates_e.unsqueeze(1)); - } - // Store per-expert result - y_list[e] = y; + Y_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te).copy_(y); } }); - // Concatenate all expert outputs to match expert_tokens order - auto Y_all = at::cat(y_list, /*dim=*/0); + if (!apply_router_weight_on_input) { + Y_all = Y_all.mul(expert_gates.unsqueeze(1)); + } + auto out = at::zeros({T, H}, x.options()); out = at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all); From 863aca801cc2d60dde395323c3f9a0ac962bfedf Mon Sep 17 00:00:00 2001 From: Zhang Xiangze Date: Wed, 22 Oct 2025 03:34:20 +0000 Subject: [PATCH 2/3] Fix pre-commit Signed-off-by: Zhang Xiangze --- csrc/moe/dynamic_4bit_int_moe_cpu.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp index bdc893bb93be..0b2c524b2cb2 100644 --- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp +++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp @@ -118,8 +118,8 @@ torch::Tensor dynamic_4bit_int_moe_cpu( if (activation_kind == ActivationKind::SwiGLUOAI) { // SwiGLUOAI auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I); auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I); - constexpr double kAlpha = 1.702; // GPT-OSS default - constexpr double kLimit = 7.0; // GPT-OSS default + constexpr double kAlpha = 1.702; // GPT-OSS default + constexpr double kLimit = 7.0; // GPT-OSS default auto gate_c = at::clamp_max(g_part, kLimit); auto up_c = at::clamp(u_part, -kLimit, kLimit); auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha))); From c083ca48dcb1d9c4cca20a3a23cba997d087485c Mon Sep 17 00:00:00 2001 From: Zhang Xiangze Date: Mon, 3 Nov 2025 04:34:47 +0000 Subject: [PATCH 3/3] Revert silu_and_mul change Signed-off-by: Zhang Xiangze --- csrc/moe/dynamic_4bit_int_moe_cpu.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp index 0b2c524b2cb2..df47bb8dd1d7 100644 --- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp +++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp @@ -21,8 +21,6 @@ inline torch::Tensor mm(const torch::Tensor& a, const torch::Tensor& packed_w, #endif } -extern void silu_and_mul(torch::Tensor& out, torch::Tensor& input); - enum ActivationKind : int64_t { SwiGLU_Gu = 0, // act = SiLU(g) * u SwiGLUOAI = 1, // act = SiLU(u) * g @@ -114,19 +112,19 @@ torch::Tensor dynamic_4bit_int_moe_cpu( auto y13 = mm(x_e, w13_e, g_eff_13, /*in_features=*/H, /*out_features=*/I2); + auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I); + auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I); + torch::Tensor act; if (activation_kind == ActivationKind::SwiGLUOAI) { // SwiGLUOAI - auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I); - auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I); - constexpr double kAlpha = 1.702; // GPT-OSS default - constexpr double kLimit = 7.0; // GPT-OSS default + constexpr double kAlpha = 1.702; // GPT-OSS default + constexpr double kLimit = 7.0; // GPT-OSS default auto gate_c = at::clamp_max(g_part, kLimit); auto up_c = at::clamp(u_part, -kLimit, kLimit); auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha))); act = up_c.add(1.0).mul(glu); } else { // SiLU , SwiGLU_GU, vLLM maps silu to SiluAndMul() - act = at::empty({te, I}, y13.options()); - silu_and_mul(act, y13); + act = at::silu(g_part).mul(u_part); } // W2