Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/llama-arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN3NEXT, "qwen3next" },
{ LLM_ARCH_QWEN3VL, "qwen3vl" },
{ LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
{ LLM_ARCH_QWEN35MOE, "qwen35moe" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" },
Expand Down Expand Up @@ -159,6 +160,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
{ LLM_KV_FULL_ATTENTION_INTERVAL, "%s.full_attention_interval" },


{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
Expand Down
5 changes: 4 additions & 1 deletion src/llama-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ enum llm_arch {
LLM_ARCH_QWEN3NEXT,
LLM_ARCH_QWEN3VL,
LLM_ARCH_QWEN3VLMOE,
LLM_ARCH_QWEN35MOE,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
LLM_ARCH_PLAMO,
Expand Down Expand Up @@ -152,7 +153,7 @@ enum llm_kv {
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
LLM_KV_ATTENTION_INDEXER_TOP_K,

LLM_KV_FULL_ATTENTION_INTERVAL,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,
Expand Down Expand Up @@ -285,6 +286,8 @@ enum llm_tensor {
LLM_TENSOR_SSM_NORM,
LLM_TENSOR_SSM_OUT,
LLM_TENSOR_SSM_BETA_ALPHA,
LLM_TENSOR_SSM_ALPHA,
LLM_TENSOR_SSM_BETA,
LLM_TENSOR_ATTN_Q_A,
LLM_TENSOR_ATTN_Q_B,
LLM_TENSOR_ATTN_KV_A_MQA,
Expand Down
145 changes: 143 additions & 2 deletions src/llama-build-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ ggml_cgraph * llm_build_context::build_k_shift() {
ggml_set_input(lctx.inp_K_shift);

for (int il = 0; il < n_layer; ++il) {
if (model.arch == LLM_ARCH_QWEN3NEXT && hparams.is_recurrent(il)) {
if ((model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) && hparams.is_recurrent(il)) {
continue;
}
if (kv_self.k_l[il] == nullptr) {
Expand Down Expand Up @@ -241,7 +241,7 @@ ggml_cgraph * llm_build_context::build_defrag(const std::vector<uint32_t> & ids)
}

for (int il = 0; il < n_layer; ++il) {
if (model.arch == LLM_ARCH_QWEN3NEXT && hparams.is_recurrent(il)) {
if ((model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) && hparams.is_recurrent(il)) {
continue;
}
if (kv_self.k_l[il] == nullptr) {
Expand Down Expand Up @@ -4478,6 +4478,143 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
return gf;
}

// Build the compute graph for the Qwen3.5-MoE architecture.
//
// The model interleaves two attention flavors per layer (selected by
// hparams.is_recurrent(il)):
//   - recurrent layers: delta-net linear attention (delegated to
//     delta_net::build_layer_attn_linear)
//   - full-attention layers: gated multi-head attention where the Q
//     projection produces 2*n_embd_head per head — the first half is the
//     query, the second half a per-channel sigmoid output gate
// Every layer is followed by a residual add and a MoE FFN with an optional
// shared expert.
ggml_cgraph * llm_build_context::build_qwen35moe() {
    // Chunk size for the delta-net chunked linear-attention kernel; kept
    // identical to the qwen3next builder this architecture is derived from.
    static constexpr int QWEN35MOE_CHUNK_SIZE = 64;

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

    // helper that owns the linear-attention (recurrent) layer construction
    delta_net delta(lctx, batch);

    const int64_t n_embd_head = hparams.n_embd_head_v;
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    // multi-section (M-RoPE style) rotary embedding dimensions
    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);

    // Gated full-attention layer: returns the attention output (pre-residual).
    auto build_layer_attn = [&](ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * KQ_mask, int il) -> ggml_tensor * {
        // wq produces [2*n_embd_head*n_head, n_tokens]: query + output gate fused
        auto Qaux = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
        auto Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
        auto Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
        cb(Qaux, "Qaux", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);
        ggml_build_forward_expand(gf, Qaux);
        ggml_build_forward_expand(gf, Kcur);
        ggml_build_forward_expand(gf, Vcur);

        // split the fused projection: first n_embd_head channels per head are Q,
        // the next n_embd_head channels are the (pre-sigmoid) output gate
        Qaux = ggml_reshape_3d(ctx0, Qaux, n_embd_head * 2, n_head, n_tokens);
        auto Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, Qaux, n_embd_head, n_head, n_tokens, Qaux->nb[1], Qaux->nb[2], 0));
        auto gate = ggml_cont_2d(ctx0, ggml_view_3d(ctx0, Qaux, n_embd_head, n_head, n_tokens, Qaux->nb[1], Qaux->nb[2], n_embd_head*ggml_element_size(Qaux)), n_embd_head*n_head, n_tokens);
        cb(Qcur, "Qcur", il);
        cb(gate, "gate", il);

        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

        // per-head RMS norm on Q and K (QK-norm), applied before RoPE
        Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
        cb(Qcur, "Qcur_normed", il);

        Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, cb, il);
        cb(Kcur, "Kcur_normed", il);

        Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
                n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);

        Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
                n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);

        cb(Qcur, "Qcur_roped", il);
        cb(Kcur, "Kcur_roped", il);

        // standard KV-cache attention; fall back to 1/sqrt(d) scaling when no
        // explicit attention scale is configured
        ggml_tensor * attn = llm_build_kv(ctx0, lctx, kv_self, gf, nullptr, nullptr,
                Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv,
                hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale, cb, il);
        cb(attn, "attn_pregate", il);

        // apply the sigmoid output gate element-wise before the output projection
        gate = ggml_sigmoid(ctx0, gate);
        cb(gate, "gate_sigmoid", il);
        attn = ggml_mul(ctx0, attn, gate);
        cb(attn, "attn_gated", il);

        attn = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, attn);
        cb(attn, "attn_output", il);

        return attn;
    };

    ggml_tensor * inpL        = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
    ggml_tensor * inp_pos     = build_inp_pos();
    // output-row selection is only needed when more than one token is processed
    ggml_tensor * inp_out_ids = n_tokens > 1 ? build_inp_out_ids() : nullptr;
    ggml_tensor * KQ_mask     = build_inp_KQ_mask();

    // sequence-id input consumed by the recurrent (delta-net) layers
    lctx.inp_s_seq_qnext = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, 1, n_tokens);
    cb(lctx.inp_s_seq_qnext, "inp_s_seq_qnext", -1);
    ggml_set_input(lctx.inp_s_seq_qnext);

    // fixed chunk-local masks shared by all delta-net layers:
    //   causal_mask: strictly-lower-triangular ones
    //   identity:    identity matrix
    //   diag_mask:   lower-triangular including the diagonal
    ggml_tensor * causal_mask = nullptr;
    ggml_tensor * identity    = nullptr;
    ggml_tensor * diag_mask   = nullptr;
    causal_mask = ggml_tri(ctx0,
            ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN35MOE_CHUNK_SIZE, QWEN35MOE_CHUNK_SIZE), 1.0f),
            GGML_TRI_TYPE_LOWER);
    identity  = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN35MOE_CHUNK_SIZE), 1.0f));
    diag_mask = ggml_add(ctx0, causal_mask, identity);
    ggml_build_forward_expand(gf, causal_mask);
    ggml_build_forward_expand(gf, identity);
    ggml_build_forward_expand(gf, diag_mask);

    ggml_tensor * cur = nullptr;

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
        cb(cur, "attn_norm", il);

        // dispatch between the linear (recurrent) and gated full-attention paths
        if (hparams.is_recurrent(il)) {
            cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
        } else {
            cur = build_layer_attn(cur, inp_pos, KQ_mask, il);
        }

        // on the last layer, keep only the rows whose outputs are requested
        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        cur = ggml_add(ctx0, cur, inpSA);
        cb(cur, "attn_residual", il);

        // MoE FFN (softmax gating, SiLU experts) with a SiLU shared expert.
        // NOTE(review): shared-expert bias slots are passed as nullptr — the
        // checkpoint appears to carry no shared-expert biases; confirm against
        // the converter.
        cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
            model.layers[il].ffn_gate_inp,  nullptr,
            model.layers[il].ffn_up_exps,   nullptr,
            model.layers[il].ffn_gate_exps, nullptr,
            model.layers[il].ffn_down_exps, nullptr,
            nullptr,
            model.layers[il].ffn_up_shexp,   nullptr,
            model.layers[il].ffn_gate_shexp, nullptr,
            model.layers[il].ffn_down_shexp, nullptr,
            n_expert, n_expert_used,
            LLM_FFN_SILU, true, false, 0.0f,
            LLM_EXPERT_GATING_FUNC_SOFTMAX,
            LLM_FFN_SILU, cb, il, gf, true, model.layers[il].ffn_up_gate_exps, nullptr, model.layers[il].ffn_gate_inp_shexp);

        // apply any loaded control vector for this layer
        cur = lctx.cvec.apply_to(ctx0, cur, il);
        cb(cur, "l_out", il);

        inpL = cur;
    }

    cur = build_output(lctx, ctx0, inpL, model.output, model.output_norm, cb);
    cb(cur, "result_output", -1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}

ggml_cgraph * llm_build_context::build_qwen3vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

Expand Down Expand Up @@ -9508,6 +9645,10 @@ ggml_cgraph * llm_build_context::llama_build_graph(
{
result = llm.build_qwen3next();
} break;
case LLM_ARCH_QWEN35MOE:
{
result = llm.build_qwen35moe();
} break;
case LLM_ARCH_QWEN3VL:
{
result = llm.build_qwen3vl();
Expand Down
2 changes: 2 additions & 0 deletions src/llama-build-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,8 @@ struct llm_build_context {

ggml_cgraph * build_qwen3next();

ggml_cgraph * build_qwen35moe();

ggml_cgraph * build_phi2();

ggml_cgraph * build_phi3();
Expand Down
84 changes: 49 additions & 35 deletions src/llama-delta-net.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ std::pair<ggml_tensor *, ggml_tensor *> delta_net::build_delta_net_chunking(ggml
GGML_ASSERT(v->ne[2] == n_tokens);
GGML_ASSERT(k->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
if (beta->ne[0] != H_v || beta->ne[2] != n_tokens || beta->ne[3] != n_seqs) {
printf("beta: %ld x %ld x %ld, expected %ld x %ld x %ld\n", beta->ne[0], beta->ne[2], beta->ne[3], H_v, n_tokens, n_seqs);
}
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs);
GGML_ASSERT(H_k == H_v);
Expand Down Expand Up @@ -320,10 +323,6 @@ std::pair<ggml_tensor *, ggml_tensor *> delta_net::build_delta_net_autoregressiv
GGML_ASSERT(H_k == H_v);
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs);

//const float eps_norm = hparams.f_norm_rms_eps;
//q = ggml_l2_norm(ctx0, q, eps_norm);
//k = ggml_l2_norm(ctx0, k, eps_norm);

const float scale = 1.0f / sqrtf(S_v);

q = ggml_scale(ctx0, q, scale);
Expand Down Expand Up @@ -464,35 +463,45 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
const uint32_t qnext_state_slots = llm_build_context::llama_kv_qnext_state_slots(kv_self);
GGML_ASSERT(qnext_state_slots > 0);


const int64_t n_tok = cur->ne[1];
const int64_t n_seqs = 1;
const int64_t n_seq_tokens = n_tok;

auto qkvz = build_qkvz(ctx0, cur, il, cb);
ggml_tensor * qkv_mixed = qkvz.first;
ggml_tensor * z = qkvz.second;

ggml_tensor * mixed_ba = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_beta_alpha, cur);
cb(mixed_ba, "linear_attn_mixed_ba", il);

int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_tok, 1);

int64_t split_sizes_ba[2] = {
num_v_heads / num_k_heads,
num_v_heads / num_k_heads
};

ggml_tensor * b = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_tok, 1,
mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], 0);
cb(b, "b", il);

ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_tok, 1,
mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3],
split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
cb(a, "a", il);

ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_tok, 1);
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_tok, 1);
ggml_tensor *alpha, *beta;
if (model.layers[il].ssm_beta_alpha) {
ggml_tensor * mixed_ba = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_beta_alpha, cur);
cb(mixed_ba, "linear_attn_mixed_ba", il);

int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_tok, 1);

int64_t split_sizes_ba[2] = {
num_v_heads / num_k_heads,
num_v_heads / num_k_heads
};

ggml_tensor * b = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_tok, 1,
mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], 0);
cb(b, "b", il);

ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_tok, 1,
mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3],
split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
cb(a, "a", il);

beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_tok, 1);
alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_tok, 1);
} else {
beta = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_beta, cur);
beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_tok, 1);
alpha = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_alpha, cur);
// Why???
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
}
cb(beta, "beta", il);
cb(alpha, "alpha", il);

Expand Down Expand Up @@ -569,16 +578,21 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_

if (num_k_heads != num_v_heads) {
GGML_ASSERT(num_v_heads % num_k_heads == 0);
const int64_t repeat_factor = num_v_heads / num_k_heads;
if (model.layers[il].ssm_beta_alpha) {
const int64_t repeat_factor = num_v_heads / num_k_heads;

ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_tok);
ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_tok);
ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_tok);
ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_tok);

ggml_tensor * q_repeated = ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_tok, 1);
ggml_tensor * k_repeated = ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_tok, 1);
ggml_tensor * q_repeated = ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_tok, 1);
ggml_tensor * k_repeated = ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_tok, 1);

q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_tok, 1);
k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_tok, 1);
q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_tok, 1);
k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_tok, 1);
} else {
q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
}
}

cb(q_conv, "q_conv_predelta", il);
Expand Down Expand Up @@ -645,7 +659,7 @@ ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgrap
GGML_ASSERT(model.layers[il].ssm_conv1d != nullptr);
GGML_ASSERT(model.layers[il].ssm_dt != nullptr);
GGML_ASSERT(model.layers[il].ssm_a != nullptr);
GGML_ASSERT(model.layers[il].ssm_beta_alpha != nullptr);
GGML_ASSERT(model.layers[il].ssm_beta_alpha != nullptr || (model.layers[il].ssm_alpha != nullptr && model.layers[il].ssm_beta != nullptr));
GGML_ASSERT(model.layers[il].ssm_norm != nullptr);
GGML_ASSERT(model.layers[il].ssm_out != nullptr);
GGML_ASSERT(model.layers[il].wqkv != nullptr || model.layers[il].ssm_in != nullptr);
Expand Down
Loading