34 commits
8448b23
initial commit for branch glm45v
ddh0 Oct 14, 2025
70c8686
use F32 accumulators for GLM4V_MOE
ddh0 Oct 14, 2025
631d4fa
add arch
ddh0 Oct 14, 2025
2aa6985
llama-model : add placeholders
ddh0 Oct 14, 2025
d0e9dce
fix arch name for tensor names
ddh0 Oct 14, 2025
0a72591
Merge branch 'ggml-org:master' into glm45v-2
ddh0 Oct 15, 2025
01d085d
WIP conversion logic
ddh0 Oct 15, 2025
5633947
Merge branch 'ggml-org:master' into glm45v-2
ddh0 Oct 15, 2025
14cee9c
better class names
ddh0 Oct 15, 2025
e0b6064
add `clip.vision.rope.*` to GGUF constants
ddh0 Oct 16, 2025
7bdc330
add `add_vision_rope_freq_base` for GGUF metadata
ddh0 Oct 16, 2025
ed7c271
set `clip.vision.rope.freq_base` during conversion
ddh0 Oct 16, 2025
9e9a4a8
Merge branch 'ggml-org:master' into glm45v-2
ddh0 Oct 16, 2025
a41109d
Merge branch 'ggml-org:master' into glm45v-2
ddh0 Oct 17, 2025
9d77113
Merge branch 'ggml-org:master' into glm45v-2
ddh0 Oct 18, 2025
9058117
Merge branch 'ggml-org:master' into glm45v-2
ddh0 Oct 20, 2025
1ed15dc
Merge branch 'master' into glm45v-2
ddh0 Oct 27, 2025
fd6236f
Merge branch 'master' into glm45v-2
ddh0 Oct 27, 2025
fb83bb5
Merge branch 'master' into glm45v-2
ddh0 Oct 30, 2025
7d05d7c
Merge branch 'master' into glm45v-2
ddh0 Nov 1, 2025
deb1399
Merge branch 'ggml-org:master' into glm45v-2
ddh0 Nov 3, 2025
e729ace
Merge branch 'ggml-org:master' into glm45v-2
ddh0 Nov 4, 2025
99e8e6a
use the same ViT for GLM-4.1V and GLM-4.5V
ddh0 Nov 5, 2025
94e8983
separate architectures for GLM4V and GLM4V_MOE
ddh0 Nov 5, 2025
484d18c
fix typo
ddh0 Nov 5, 2025
c84a431
add GLM4V arch tensor map
ddh0 Nov 5, 2025
7267e8a
fix typo
ddh0 Nov 5, 2025
ac54c71
add `glm4v` and `glm4v_moe` to src/models
ddh0 Nov 5, 2025
e3009de
Merge branch 'ggml-org:master' into glm45v-2
ddh0 Nov 5, 2025
c17d4b9
revert old RoPE GGUF changes
ddh0 Nov 5, 2025
8a6ad0c
begin adding GLM4V projector
ddh0 Nov 5, 2025
f39b231
copy LLM graph code from text models (WIP)
ddh0 Nov 5, 2025
b60c16a
consistent arch naming
ddh0 Nov 5, 2025
6443ecb
WIP conversion logic
ddh0 Nov 6, 2025
80 changes: 80 additions & 0 deletions convert_hf_to_gguf.py
@@ -9807,6 +9807,86 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

return [] # skip other tensors

@ModelBase.register("Glm4vForConditionalGeneration")
class GLM4VModel(Glm4Model):
"""Text model from [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking)

ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)"""
model_arch = gguf.MODEL_ARCH.GLM4V

def set_gguf_parameters(self):
super().set_gguf_parameters()

def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
# skip vision tensors for the text model
if name.startswith("model.visual."):
return []

# the Glm4Model class expects tensor names to start with 'model.',
# so we strip the 'language_model.' prefix
if name.startswith("model.language_model."):
name = name.replace("model.language_model.", "model.", 1)

# let the Glm4Model class handle the tensor mapping
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Glm4vMoeForConditionalGeneration")
class GLM4VMoEModel(Glm4MoeModel):
"""Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)

ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)"""
model_arch = gguf.MODEL_ARCH.GLM4V_MOE

def set_gguf_parameters(self):
# parameters specific to GLM-4.5V like rope_theta=10000 and context_length=65536
# should be correctly picked up from the text_config by the base classes
super().set_gguf_parameters()

def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
# skip vision tensors for the text model
if name.startswith("model.visual."):
return []

# the Glm4MoeModel class expects tensor names to start with 'model.',
# so we strip the 'language_model.' prefix
if name.startswith("model.language_model."):
name = name.replace("model.language_model.", "model.", 1)

# let the Glm4MoeModel class handle the MoE logic and tensor mapping
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Glm4vMoeForConditionalGeneration", "Glm4vForConditionalGeneration")
class GLM4VisionModel(MmprojModel):
"""Multimodal projector from:
- [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking)
- [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)

ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)"""
#
# TODO: conversion logic is still WIP!
#
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
vparams = self.hparams_vision
ln_eps = vparams.get("layer_norm_eps", 1e-5)

self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps)
self.gguf_writer.add_vision_use_silu(True)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.startswith("model.visual."):
yield self.map_tensor_name(name), data_torch
else:
return

@ModelBase.register("CogVLMForCausalLM")
class CogVLMVisionModel(MmprojModel):
57 changes: 52 additions & 5 deletions gguf-py/gguf/constants.py
@@ -391,6 +391,8 @@ class MODEL_ARCH(IntEnum):
CHATGLM = auto()
GLM4 = auto()
GLM4_MOE = auto()
GLM4V = auto()
GLM4V_MOE = auto()
BITNET = auto()
T5 = auto()
T5ENCODER = auto()
@@ -437,6 +439,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
GLM_EDGE = auto()
MERGER = auto()
GEMMA3 = auto()
GLM4V = auto()
QWEN3VL = auto()
COGVLM = auto()

@@ -683,10 +686,10 @@ class MODEL_TENSOR(IntEnum):
A_MM_NORM_PRE = auto()
A_MM_NORM_MID = auto()
# nextn/mtp
NEXTN_EH_PROJ = auto()
NEXTN_EMBED_TOKENS = auto()
NEXTN_ENORM = auto()
NEXTN_HNORM = auto()
NEXTN_EH_PROJ = auto()
NEXTN_EMBED_TOKENS = auto()
NEXTN_ENORM = auto()
NEXTN_HNORM = auto()
NEXTN_SHARED_HEAD_HEAD = auto()
NEXTN_SHARED_HEAD_NORM = auto()

@@ -757,7 +760,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.GLM4: "glm4",
MODEL_ARCH.GLM4_MOE: "glm4moe",
MODEL_ARCH.GLM4_MOE: "glm4_moe",
MODEL_ARCH.GLM4V: "glm4v",
MODEL_ARCH.GLM4V_MOE: "glm4v_moe",
MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5",
MODEL_ARCH.T5ENCODER: "t5encoder",
@@ -805,6 +810,7 @@ class MODEL_TENSOR(IntEnum):
VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter",
VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger",
VISION_PROJECTOR_TYPE.GEMMA3: "gemma3",
VISION_PROJECTOR_TYPE.GLM4V: "glm4v",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -2365,6 +2371,46 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
],
MODEL_ARCH.GLM4V: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_POST_NORM,
],
MODEL_ARCH.GLM4V_MOE: [ # same as GLM4_MOE without MTP tensors
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
MODEL_ARCH.BITNET: [
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
@@ -3204,6 +3250,7 @@ class VisionProjectorType:
VOXTRAL = "voxtral"
LFM2 = "lfm2"
KIMIVL = "kimivl"
GLM4V = "glm4v"
LIGHTONOCR = "lightonocr"
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
51 changes: 49 additions & 2 deletions src/llama-arch.cpp
@@ -67,7 +67,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_GLM4, "glm4" },
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
{ LLM_ARCH_GLM4_MOE, "glm4_moe" },
{ LLM_ARCH_GLM4V, "glm4v" },
{ LLM_ARCH_GLM4V_MOE, "glm4v_moe" },
{ LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" },
{ LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -1506,7 +1508,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
LLM_ARCH_GLM4,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, // does this really exist?
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
@@ -1555,6 +1557,51 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
},
},
{
LLM_ARCH_GLM4V,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
LLM_ARCH_GLM4V_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
},
},
{
LLM_ARCH_BITNET,
{
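Note for readers following the tensor-name maps above: each per-layer entry is a printf-style pattern whose `%d` is substituted with the block index when concrete GGUF tensor names are resolved (plus a `.weight` or `.bias` suffix). The snippet below is an illustrative sketch only, not the actual llama.cpp helper:

```cpp
#include <cstdio>
#include <string>

// Illustrative only: expand a per-layer pattern such as "blk.%d.attn_q"
// with a block index and a suffix, yielding names like "blk.3.attn_q.weight".
static std::string layer_tensor_name(const char * pattern, int il, const char * suffix) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), pattern, il);
    return std::string(buf) + "." + suffix;
}

int main() {
    std::printf("%s\n", layer_tensor_name("blk.%d.attn_q", 3, "weight").c_str());        // blk.3.attn_q.weight
    std::printf("%s\n", layer_tensor_name("blk.%d.ffn_gate_exps", 0, "weight").c_str()); // blk.0.ffn_gate_exps.weight
    return 0;
}
```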
3 changes: 2 additions & 1 deletion src/llama-arch.h
@@ -72,6 +72,8 @@ enum llm_arch {
LLM_ARCH_CHATGLM,
LLM_ARCH_GLM4,
LLM_ARCH_GLM4_MOE,
LLM_ARCH_GLM4V,
LLM_ARCH_GLM4V_MOE,
LLM_ARCH_BITNET,
LLM_ARCH_T5,
LLM_ARCH_T5ENCODER,
@@ -129,7 +131,6 @@ enum llm_kv {
LLM_KV_GENERAL_LICENSE,
LLM_KV_GENERAL_SOURCE_URL,
LLM_KV_GENERAL_SOURCE_HF_REPO,

LLM_KV_VOCAB_SIZE,
LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH,
18 changes: 14 additions & 4 deletions src/llama-graph.cpp
@@ -820,8 +820,13 @@ ggml_tensor * llm_graph_context::build_ffn(

if (down) {
cur = build_lora_mm(down, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
if (
arch == LLM_ARCH_GLM4 ||
arch == LLM_ARCH_GLM4_MOE ||
arch == LLM_ARCH_GLM4V ||
arch == LLM_ARCH_GLM4V_MOE
) {
// GLM4 models seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}
@@ -1618,8 +1623,13 @@ ggml_tensor * llm_graph_context::build_attn(

if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
if (
arch == LLM_ARCH_GLM4 ||
arch == LLM_ARCH_GLM4_MOE ||
arch == LLM_ARCH_GLM4V ||
arch == LLM_ARCH_GLM4V_MOE
) {
// GLM4 models seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}
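The hunks above extend an existing workaround to the new GLM4V architectures: the output tensor of the affected mat-muls is tagged so that ggml uses F32 accumulation instead of half precision. The sketch below shows that call in isolation, assuming `ggml.h` from this repository is on the include path; shapes are arbitrary and the graph is only constructed, never evaluated:

```cpp
#include "ggml.h"

int main() {
    // metadata-only context: no_alloc means no tensor data is allocated,
    // we only build the op to show where the precision hint is attached
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u * 1024u * 1024u,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096); // e.g. an attention output weight
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 8);    // current activations

    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);
    ggml_mul_mat_set_prec(y, GGML_PREC_F32); // request F32 accumulators for this op, as in build_ffn/build_attn above

    ggml_free(ctx);
    return 0;
}
```

Setting the precision per mat-mul output keeps the extra cost local to the architectures that need it, which is why the change is an arch check around the hint rather than a global switch.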
28 changes: 28 additions & 0 deletions src/llama-model.cpp
@@ -1665,6 +1665,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GLM4V:
{
// TODO
} break;
case LLM_ARCH_GLM4V_MOE:
{
// TODO
} break;
case LLM_ARCH_BITNET:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5016,6 +5024,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
break;
case LLM_ARCH_GLM4V:
{
// TODO
}
break;
case LLM_ARCH_GLM4V_MOE:
{
// TODO
}
break;
case LLM_ARCH_NEMOTRON:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -7151,6 +7169,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_glm4_moe>(*this, params);
} break;
case LLM_ARCH_GLM4V:
{
llm = std::make_unique<llm_build_glm4v>(*this, params);
} break;
case LLM_ARCH_GLM4V_MOE:
{
llm = std::make_unique<llm_build_glm4v_moe>(*this, params);
} break;
case LLM_ARCH_BITNET:
{
llm = std::make_unique<llm_build_bitnet>(*this, params);
@@ -7540,6 +7566,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
return LLAMA_ROPE_TYPE_NEOX;

case LLM_ARCH_QWEN2VL:
case LLM_ARCH_GLM4V:
case LLM_ARCH_GLM4V_MOE:
return LLAMA_ROPE_TYPE_MROPE;
case LLM_ARCH_QWEN3VL:
case LLM_ARCH_QWEN3VLMOE: