
Commit 364c77f

talk-llama : sync llama.cpp

Parent: 83f2ed1
19 files changed: +2597 −555 lines

examples/talk-llama/llama-arch.cpp

Lines changed: 116 additions & 1 deletion
@@ -37,6 +37,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN3NEXT, "qwen3next" },
     { LLM_ARCH_QWEN3VL, "qwen3vl" },
     { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
+    { LLM_ARCH_QWEN35, "qwen35" },
+    { LLM_ARCH_QWEN35MOE, "qwen35moe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PHIMOE, "phimoe" },
@@ -72,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_GLM4, "glm4" },
     { LLM_ARCH_GLM4_MOE, "glm4moe" },
+    { LLM_ARCH_GLM_DSA, "glm-dsa" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -195,6 +198,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
     { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
+    { LLM_KV_FULL_ATTENTION_INTERVAL, "%s.full_attention_interval" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -222,6 +226,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
+    { LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
+    { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
+    { LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -366,6 +373,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
     { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
     { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
+    { LLM_TENSOR_SSM_ALPHA, "blk.%d.ssm_alpha" },
     { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
     { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
     { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
@@ -512,6 +520,10 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
     { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
     { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
+    { LLM_TENSOR_INDEXER_K_NORM, "blk.%d.indexer.k_norm" },
+    { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
+    { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
+    { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
 };

 static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
@@ -968,7 +980,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
             LLM_TENSOR_ATTN_OUT,
             LLM_TENSOR_ATTN_QKV,
             LLM_TENSOR_ATTN_GATE,
-            LLM_TENSOR_FFN_NORM,
             LLM_TENSOR_FFN_GATE_INP,
             LLM_TENSOR_FFN_GATE_EXPS,
             LLM_TENSOR_FFN_DOWN_EXPS,
@@ -985,6 +996,63 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
             LLM_TENSOR_SSM_NORM,
             LLM_TENSOR_SSM_OUT,
         };
+        case LLM_ARCH_QWEN35:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_POST_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_Q_NORM,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_K_NORM,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_ATTN_QKV,
+                LLM_TENSOR_ATTN_GATE,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_SSM_A_NOSCAN,
+                LLM_TENSOR_SSM_CONV1D,
+                LLM_TENSOR_SSM_DT,
+                LLM_TENSOR_SSM_BETA,
+                LLM_TENSOR_SSM_ALPHA,
+                LLM_TENSOR_SSM_NORM,
+                LLM_TENSOR_SSM_OUT,
+            };
+        case LLM_ARCH_QWEN35MOE:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_POST_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_Q_NORM,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_K_NORM,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_ATTN_QKV,
+                LLM_TENSOR_ATTN_GATE,
+                LLM_TENSOR_FFN_GATE_INP,
+                LLM_TENSOR_FFN_GATE_EXPS,
+                LLM_TENSOR_FFN_DOWN_EXPS,
+                LLM_TENSOR_FFN_UP_EXPS,
+                LLM_TENSOR_FFN_GATE_INP_SHEXP,
+                LLM_TENSOR_FFN_GATE_SHEXP,
+                LLM_TENSOR_FFN_DOWN_SHEXP,
+                LLM_TENSOR_FFN_UP_SHEXP,
+                LLM_TENSOR_SSM_A_NOSCAN,
+                LLM_TENSOR_SSM_CONV1D,
+                LLM_TENSOR_SSM_DT,
+                LLM_TENSOR_SSM_BETA,
+                LLM_TENSOR_SSM_ALPHA,
+                LLM_TENSOR_SSM_NORM,
+                LLM_TENSOR_SSM_OUT,
+            };
         case LLM_ARCH_QWEN3VL:
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_HUNYUAN_DENSE:
@@ -1597,6 +1665,46 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
             LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
             LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
         };
+        case LLM_ARCH_GLM_DSA:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q_A_NORM,
+                LLM_TENSOR_ATTN_KV_A_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_Q_A,
+                LLM_TENSOR_ATTN_Q_B,
+                LLM_TENSOR_ATTN_KV_A_MQA,
+                LLM_TENSOR_ATTN_KV_B,
+                LLM_TENSOR_ATTN_K_B,
+                LLM_TENSOR_ATTN_V_B,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_GATE_INP,
+                LLM_TENSOR_FFN_GATE_EXPS,
+                LLM_TENSOR_FFN_DOWN_EXPS,
+                LLM_TENSOR_FFN_UP_EXPS,
+                LLM_TENSOR_FFN_GATE_INP_SHEXP,
+                LLM_TENSOR_FFN_GATE_SHEXP,
+                LLM_TENSOR_FFN_DOWN_SHEXP,
+                LLM_TENSOR_FFN_UP_SHEXP,
+                LLM_TENSOR_FFN_EXP_PROBS_B,
+                LLM_TENSOR_INDEXER_K_NORM,
+                LLM_TENSOR_INDEXER_PROJ,
+                LLM_TENSOR_INDEXER_ATTN_K,
+                LLM_TENSOR_INDEXER_ATTN_Q_B,
+                LLM_TENSOR_NEXTN_EH_PROJ,
+                LLM_TENSOR_NEXTN_EMBED_TOKENS,
+                LLM_TENSOR_NEXTN_ENORM,
+                LLM_TENSOR_NEXTN_HNORM,
+                LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+                LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+            };
         case LLM_ARCH_BITNET:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
@@ -2456,6 +2564,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2582,6 +2691,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_INDEXER_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     // NextN/MTP tensors are currently ignored (reserved for future MTP support)
     // These tensors only exist in the last layer(s) and are treated as output tensors
     {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
@@ -2675,6 +2788,8 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_NEMOTRON_H_MOE:
         case LLM_ARCH_QWEN3NEXT:
         case LLM_ARCH_KIMI_LINEAR:
+        case LLM_ARCH_QWEN35:
+        case LLM_ARCH_QWEN35MOE:
            return true;
        default:
            return false;
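
Note: the name tables above are printf-style templates. Per-model GGUF keys substitute the architecture name for "%s", and per-layer tensor names substitute the block index for "%d". A minimal sketch of that expansion, using the "qwen35" name registered above and an arbitrary example layer index (the index value is not part of this commit):

    #include <cstdio>

    int main() {
        char kv_key[128];
        char tensor_name[128];

        // "%s" in LLM_KV_NAMES expands to the architecture name:
        std::snprintf(kv_key, sizeof(kv_key), "%s.full_attention_interval", "qwen35");

        // "%d" in LLM_TENSOR_NAMES expands to the layer (block) index:
        std::snprintf(tensor_name, sizeof(tensor_name), "blk.%d.ssm_alpha", 3);

        std::printf("%s\n", kv_key);      // qwen35.full_attention_interval
        std::printf("%s\n", tensor_name); // blk.3.ssm_alpha
    }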

examples/talk-llama/llama-arch.h

Lines changed: 13 additions & 1 deletion
@@ -41,6 +41,8 @@ enum llm_arch {
     LLM_ARCH_QWEN3NEXT,
     LLM_ARCH_QWEN3VL,
     LLM_ARCH_QWEN3VLMOE,
+    LLM_ARCH_QWEN35,
+    LLM_ARCH_QWEN35MOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PHIMOE,
@@ -76,6 +78,7 @@ enum llm_arch {
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
     LLM_ARCH_GLM4_MOE,
+    LLM_ARCH_GLM_DSA,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -199,6 +202,7 @@ enum llm_kv {
     LLM_KV_EMBEDDING_SCALE,
     LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
+    LLM_KV_FULL_ATTENTION_INTERVAL,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -226,6 +230,9 @@ enum llm_kv {
     LLM_KV_ATTENTION_TEMPERATURE_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+    LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
+    LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
+    LLM_KV_ATTENTION_INDEXER_TOP_K,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -404,13 +411,14 @@ enum llm_tensor {
     LLM_TENSOR_SSM_NORM,
     LLM_TENSOR_SSM_OUT,
     LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
+    LLM_TENSOR_SSM_ALPHA, // qwen3.5
     // Kimi Linear KDA (using SSM_ prefix for consistency)
     LLM_TENSOR_SSM_CONV1D_Q, // kimi: Q conv1d weight
     LLM_TENSOR_SSM_CONV1D_K, // kimi: K conv1d weight
     LLM_TENSOR_SSM_CONV1D_V, // kimi: V conv1d weight
     LLM_TENSOR_SSM_F_A, // kimi: forget gate projection A
     LLM_TENSOR_SSM_F_B, // kimi: forget gate projection B
-    LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient
+    LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient and qwen3.5
     LLM_TENSOR_SSM_G_A, // kimi: output gate projection A
     LLM_TENSOR_SSM_G_B, // kimi: output gate projection B
     LLM_TENSOR_TIME_MIX_W0,
@@ -513,6 +521,10 @@ enum llm_tensor {
     LLM_TENSOR_VISEXP_FFN_GATE,
     LLM_TENSOR_VISEXP_FFN_DOWN,
     LLM_TENSOR_VISEXP_FFN_UP,
+    LLM_TENSOR_INDEXER_K_NORM,
+    LLM_TENSOR_INDEXER_PROJ,
+    LLM_TENSOR_INDEXER_ATTN_K,
+    LLM_TENSOR_INDEXER_ATTN_Q_B,
     LLM_TENSOR_NEXTN_EH_PROJ,
     LLM_TENSOR_NEXTN_EMBED_TOKENS,
     LLM_TENSOR_NEXTN_ENORM,