8 changes: 8 additions & 0 deletions common/arg.cpp
@@ -1687,6 +1687,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.yarn_beta_fast = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
+    add_opt(common_arg(
+        {"--moe-n-expert"}, "N",
+        string_format("MoE: override number of active experts (default: %d = model default)\n"
+                      "for MoE self-draft speculation, use 1 for draft context", params.moe_n_expert_override),
+        [](common_params & params, int value) {
+            params.moe_n_expert_override = value;
+        }
+    ).set_env("LLAMA_ARG_MOE_N_EXPERT"));
     add_opt(common_arg(
         {"-gan", "--grp-attn-n"}, "N",
         string_format("group-attention factor (default: %d)", params.grp_attn_n),
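Note (not part of the diff): a minimal sketch of the programmatic equivalent of passing `--moe-n-expert 1` (or setting `LLAMA_ARG_MOE_N_EXPERT=1`). It assumes only the two helpers this PR touches, `common_params` and `common_context_params_to_llama`; the wrapper function itself is illustrative.

```cpp
#include "common.h"

// Programmatic equivalent of `--moe-n-expert 1`: set the override on
// common_params, then convert to llama_context_params for the draft context.
static llama_context_params make_draft_cparams(common_params params) {
    params.moe_n_expert_override = 1; // draft context: a single active expert
    return common_context_params_to_llama(params);
}
```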
1 change: 1 addition & 0 deletions common/common.cpp
@@ -1328,6 +1328,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_beta_fast        = params.yarn_beta_fast;
     cparams.yarn_beta_slow        = params.yarn_beta_slow;
     cparams.yarn_orig_ctx         = params.yarn_orig_ctx;
+    cparams.moe_n_expert_override = params.moe_n_expert_override;
     cparams.pooling_type          = params.pooling_type;
     cparams.attention_type        = params.attention_type;
     cparams.flash_attn_type       = params.flash_attn_type;
1 change: 1 addition & 0 deletions common/common.h
@@ -321,6 +321,7 @@ struct common_params {
     float   yarn_beta_fast = -1.0f;    // YaRN low correction dim
     float   yarn_beta_slow = -1.0f;    // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;        // YaRN original context length
+    int32_t moe_n_expert_override = 0; // MoE self-draft: override n_expert_used (0 = use model default)
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
5 changes: 5 additions & 0 deletions include/llama.h
@@ -340,6 +340,11 @@ extern "C" {
         uint32_t yarn_orig_ctx;   // YaRN original context size
         float    defrag_thold;    // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
+        // MoE self-drafting: override n_expert_used for this context
+        // 0 = use model default, 1+ = force exactly N active experts
+        // used for MoE self-draft speculation: the draft context uses n = 1, the verify context uses the model default
+        int32_t moe_n_expert_override;
+
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
 
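To illustrate the intended two-context pattern, here is a hedged sketch of a client setting up a draft context (one active expert) and a verify context (model default) over the same weights. The model path is a placeholder; everything except the new `moe_n_expert_override` field is existing llama.h API.

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path

    // draft context: route each token through a single expert (fast, approximate)
    llama_context_params cp_draft = llama_context_default_params();
    cp_draft.moe_n_expert_override = 1;
    llama_context * ctx_draft = llama_init_from_model(model, cp_draft);

    // verify context: 0 keeps the model's trained n_expert_used (full quality)
    llama_context_params cp_verify = llama_context_default_params();
    cp_verify.moe_n_expert_override = 0;
    llama_context * ctx_verify = llama_init_from_model(model, cp_verify);

    // ... draft tokens on ctx_draft, accept/reject them on ctx_verify ...

    llama_free(ctx_draft);
    llama_free(ctx_verify);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```

Since both contexts share one set of model weights, the draft pass reuses the verify model's memory; only the per-context state (KV cache, compute graph) is duplicated.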
2 changes: 2 additions & 0 deletions src/llama-context.cpp
@@ -97,6 +97,7 @@ llama_context::llama_context(

     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
+    cparams.moe_n_expert_override = params.moe_n_expert_override;
 
     {
         const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
@@ -2303,6 +2304,7 @@ llama_context_params llama_context_default_params() {
         /*.yarn_beta_slow        =*/ -1.0f,
         /*.yarn_orig_ctx         =*/ 0,
         /*.defrag_thold          =*/ -1.0f,
+        /*.moe_n_expert_override =*/ 0,
         /*.cb_eval               =*/ nullptr,
         /*.cb_eval_user_data     =*/ nullptr,
         /*.type_k                =*/ GGML_TYPE_F16,
4 changes: 4 additions & 0 deletions src/llama-cparams.h
@@ -35,6 +35,10 @@ struct llama_cparams {
     bool op_offload;
     bool kv_unified;
 
+    // MoE self-drafting: override n_expert_used
+    // 0 = use model default, 1+ = force exactly N active experts
+    int32_t moe_n_expert_override;
+
     enum llama_pooling_type pooling_type;
 
     ggml_backend_sched_eval_callback cb_eval;
57 changes: 42 additions & 15 deletions src/llama-graph.cpp
@@ -995,16 +995,38 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
+    // HARD MASK: If moe_n_expert_override is set, slice tensors to only use first N experts
+    // This actually reduces computation by only loading/computing N experts instead of all n_expert_used
+    // Unlike soft mask (which zeros weights but still computes all experts), hard mask skips the computation entirely
+    int32_t n_expert_exec = n_expert_used; // Default: execute all selected experts
+    if (cparams.moe_n_expert_override > 0 && cparams.moe_n_expert_override < n_expert_used) {
+        n_expert_exec = cparams.moe_n_expert_override;
+
+        // Slice selected_experts from [n_expert_used, n_tokens] to [n_expert_exec, n_tokens]
+        // This causes ggml_mul_mat_id to only load and compute the first n_expert_exec experts
+        selected_experts = ggml_view_2d(ctx0, selected_experts, n_expert_exec, n_tokens,
+                selected_experts->nb[1], 0);
+        // Make contiguous for subsequent operations
+        selected_experts = ggml_cont(ctx0, selected_experts);
+        cb(selected_experts, "ffn_moe_topk_sliced", il);
+
+        // Slice weights from [1, n_expert_used, n_tokens] to [1, n_expert_exec, n_tokens]
+        weights = ggml_view_3d(ctx0, weights, 1, n_expert_exec, n_tokens,
+                weights->nb[1], weights->nb[2], 0);
+        // Make contiguous for subsequent reshape operations
+        weights = ggml_cont(ctx0, weights);
+        cb(weights, "ffn_moe_weights_sliced", il);
+    }
+
     if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
-        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
-        weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
-        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_exec, n_tokens);
+        weights = ggml_soft_max(ctx0, weights); // [n_expert_exec, n_tokens]
+        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_exec, n_tokens);
         cb(weights, "ffn_moe_weights_softmax", il);
     }
 
     if (norm_w) {
-        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_exec, n_tokens);
 
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);
@@ -1013,10 +1035,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
         cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
 
-        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
+        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_exec, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);
 
-        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_exec, n_tokens);
     }
     if (scale_w) {
         weights = ggml_scale(ctx0, weights, w_scale);
@@ -1029,8 +1051,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
 
     if (weight_before_ffn) {
-        // repeat cur to [n_embd, n_expert_used, n_tokens]
-        ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_used, n_tokens, 1);
+        // repeat cur to [n_embd, n_expert_exec, n_tokens]
+        ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_exec, n_tokens, 1);
         cur = ggml_mul(ctx0, repeated, weights);
         cb(cur, "ffn_moe_weighted", il);
     }
@@ -1108,26 +1130,31 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

     ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
 
-    assert(n_expert_used > 0);
+    // Determine actual expert count for aggregation
+    // When --moe-n-expert is set (hard mask mode), use n_expert_exec
+    // Otherwise use hparams.n_expert_used to avoid dynamic allocation issues during warmup
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
+    const uint32_t n_expert_agg = (cparams.moe_n_expert_override > 0)
+            ? (uint32_t) n_expert_exec
+            : hparams.n_expert_used;
+
+    assert(n_expert_agg > 0);
 
     // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+    for (uint32_t i = 0; i < n_expert_agg; ++i) {
         cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
 
         ggml_build_forward_expand(gf, cur_experts[i]);
     }
 
     // aggregate experts
-    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
-    // to avoid potentially a large number of add nodes during warmup
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = cur_experts[0];
 
-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+    for (uint32_t i = 1; i < n_expert_agg; ++i) {
         moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
     }
 
-    if (hparams.n_expert_used == 1) {
+    if (n_expert_agg == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
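For clarity, the hard-mask slice from build_moe_ffn restated as a standalone helper (a sketch under the same assumptions as the PR, not code from it; the function name is illustrative). The slice is valid because `selected_experts` comes from `ggml_top_k`, which orders indices by descending routing probability, so keeping the first rows keeps the strongest experts.

```cpp
#include "ggml.h"

// Keep only the first n_exec of the selected experts for every token.
// selected: I32 tensor of shape [n_expert_used, n_tokens] produced by
// ggml_top_k, i.e. row 0 holds each token's highest-probability expert.
static ggml_tensor * hard_mask_experts(ggml_context * ctx,
                                       ggml_tensor  * selected,
                                       int64_t        n_exec) {
    const int64_t n_tokens = selected->ne[1];

    // the view is free: same data, smaller ne[0]; ggml_mul_mat_id fed with
    // this view gathers only n_exec expert matrices per token
    ggml_tensor * sliced = ggml_view_2d(ctx, selected, n_exec, n_tokens,
                                        selected->nb[1], 0);

    // downstream reshapes expect contiguous memory
    return ggml_cont(ctx, sliced);
}
```

This is what distinguishes the hard mask from a soft mask: a soft mask would zero the routing weights of the dropped experts but still run `ggml_mul_mat_id` over all of them, whereas shrinking the index tensor removes the dropped experts from the graph entirely.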