From a3d5a0ac18d5b2614dad3dbe525088d6b683eb52 Mon Sep 17 00:00:00 2001 From: llyycchhee Date: Sat, 28 Feb 2026 11:01:05 +0800 Subject: [PATCH 1/3] [models-hub] Update llm:glm-5 --- xinference/model/llm/llm_family.json | 56 ++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 9722a492bf..cef6264637 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -26202,5 +26202,61 @@ }, "featured": false, "updated_at": 1772095707 + }, + { + "model_name": "glm-5", + "model_description": "We are launching GLM-5, targeting complex systems engineering and long-horizon agentic tasks. Scaling is still one of the most important ways to improve the intelligence efficiency of Artificial General Intelligence (AGI). Compared to GLM-4.5, GLM-5 scales from 355B parameters (32B active) to 744B parameters (40B active), and increases pre-training data from 23T to 28.5T tokens. GLM-5 also integrates DeepSeek Sparse Attention (DSA), largely reducing deployment cost while preserving long-context capacity. Reinforcement learning aims to bridge the gap between competence and excellence in pre-trained models. However, deploying it at scale for LLMs is a challenge due to the RL training inefficiency. To this end, we developed slime, a novel asynchronous RL infrastructure that substantially improves training throughput and efficiency, enabling more fine-grained post-training iterations. 
With advances in both pre-training and post-training, GLM-5 delivers significant improvement compared to GLM-4.7 across a wide range of academic benchmarks and achieves best-in-class performance among all open-source models in the world on reasoning, coding, and agentic tasks, closing the gap with frontier models.", + "context_length": 202752, + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "vision", + "tools", + "reasoning" + ], + "model_specs": [ + { + "model_size_in_billions": 744, + "model_format": "pytorch", + "model_src": { + "huggingface": { + "model_id": "zai-org/GLM-5" + }, + "modelscope": { + "model_id": "ZhipuAI/GLM-5" + } + } + } + ], + "architectures": [ + "GlmMoeDsaForCausalLM" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{% for tool in tools %}\n{{ tool | tojson(ensure_ascii=False) }}\n{% endfor %}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>{%- endif -%}\n{%- macro visible_text(content) -%}\n {%- if content is string -%}\n {{- content }}\n {%- elif content is iterable and content is not mapping -%}\n {%- for item in content -%}\n {%- if item is mapping and item.type == 'text' -%}\n {{- item.text }}\n {%- elif item is string -%}\n {{- item }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{- content }}\n {%- endif -%}\n{%- endmacro -%}\n{%- set ns = namespace(last_user_index=-1) %}\n{%- for m in messages %}\n {%- if m.role == 'user' %}\n {% set ns.last_user_index = loop.index0 -%}\n {%- endif %}\n{%- endfor %}\n{% for m in messages %}\n{%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}\n{%- elif m.role == 'assistant' -%}\n<|assistant|>\n{%- set reasoning_content = '' %}\n{%- set content = visible_text(m.content) %}\n{%- if 
m.reasoning_content is string %}\n {%- set reasoning_content = m.reasoning_content %}\n{%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n{%- endif %}\n{%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content -%}\n{{ '<think>' + reasoning_content.strip() + '</think>'}}\n{%- else -%}\n{{ '<think></think>' }}\n{%- endif -%}\n{%- if content.strip() -%}\n{{ content.strip() }}\n{%- endif -%}\n{% if m.tool_calls %}\n{% for tc in m.tool_calls %}\n{%- if tc.function %}\n {%- set tc = tc.function %}\n{%- endif %}\n{{- '<tool_call>' + tc.name -}}\n{% set _args = tc.arguments %}{% for k, v in _args.items() %}<arg_key>{{ k }}</arg_key><arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>{% endfor %}</tool_call>{% endfor %}\n{% endif %}\n{%- elif m.role == 'tool' -%}\n{%- if m.content is string -%}\n{%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|observation|>' }}\n{%- endif %}\n{{- '<tool_response>' }}\n{{- m.content }}\n{{- '</tool_response>' }}\n{%- else -%}\n<|observation|>{% for tr in m.content %}\n<tool_response>{{ tr.output if tr.output is defined else tr }}</tool_response>{% endfor -%}\n{% endif -%}\n{%- elif m.role == 'system' -%}\n<|system|>{{ visible_text(m.content) }}\n{%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n <|assistant|>{{- '<think></think>' if (enable_thinking is defined and not enable_thinking) else '' -}}\n{%- endif -%}", + "stop_token_ids": [ + 154820, + 154827, + 154829 + ], + "stop": [ + "<|endoftext|>" + ], + "tool_parser": "glm4", + "reasoning_start_tag": "<think>", + "reasoning_end_tag": "</think>", + "version": 2, + "virtualenv": { + "packages": [ + "#transformers_dependencies# ; #engine# == \"Transformers\"", + "#mlx_dependencies# ; #engine# == \"MLX\"", + "#vllm_dependencies# ; #engine# == \"vllm\"", + "#sglang_dependencies# ; #engine# == \"sglang\"", + "#system_numpy# ; #engine# == \"vllm\"" + ] + }, + "featured": false, + 
"updated_at": 1772247655 } ] From 261293ae0ba7b170e017b302ec14324e4a26ccdc Mon Sep 17 00:00:00 2001 From: llyycchhee Date: Sat, 28 Feb 2026 11:10:21 +0800 Subject: [PATCH 2/3] [models-hub] Update llm:glm-5 --- xinference/model/llm/llm_family.json | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index cef6264637..4ca8a64b38 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -26223,10 +26223,16 @@ "model_format": "pytorch", "model_src": { "huggingface": { - "model_id": "zai-org/GLM-5" + "model_id": "zai-org/GLM-5", + "quantizations": [ + "none" + ] }, "modelscope": { - "model_id": "ZhipuAI/GLM-5" + "model_id": "ZhipuAI/GLM-5", + "quantizations": [ + "none" + ] } } } @@ -26257,6 +26263,6 @@ ] }, "featured": false, - "updated_at": 1772247655 + "updated_at": 1772248212 } ] From 755f8eb8e438e6c67fafe348e76c428fcd477f1e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 28 Feb 2026 03:13:15 +0000 Subject: [PATCH 3/3] chore(docs): auto-run gen_docs.py --- doc/source/models/builtin/llm/glm-5.rst | 31 +++++++++++++++++++++++++ doc/source/models/builtin/llm/index.rst | 7 ++++++ 2 files changed, 38 insertions(+) create mode 100644 doc/source/models/builtin/llm/glm-5.rst diff --git a/doc/source/models/builtin/llm/glm-5.rst b/doc/source/models/builtin/llm/glm-5.rst new file mode 100644 index 0000000000..a8fc15260e --- /dev/null +++ b/doc/source/models/builtin/llm/glm-5.rst @@ -0,0 +1,31 @@ +.. _models_llm_glm-5: + +======================================== +glm-5 +======================================== + +- **Context Length:** 202752 +- **Model Name:** glm-5 +- **Languages:** en, zh +- **Abilities:** chat, vision, tools, reasoning +- **Description:** We are launching GLM-5, targeting complex systems engineering and long-horizon agentic tasks. 
Scaling is still one of the most important ways to improve the intelligence efficiency of Artificial General Intelligence (AGI). Compared to GLM-4.5, GLM-5 scales from 355B parameters (32B active) to 744B parameters (40B active), and increases pre-training data from 23T to 28.5T tokens. GLM-5 also integrates DeepSeek Sparse Attention (DSA), largely reducing deployment cost while preserving long-context capacity. Reinforcement learning aims to bridge the gap between competence and excellence in pre-trained models. However, deploying it at scale for LLMs is a challenge due to the RL training inefficiency. To this end, we developed slime, a novel asynchronous RL infrastructure that substantially improves training throughput and efficiency, enabling more fine-grained post-training iterations. With advances in both pre-training and post-training, GLM-5 delivers significant improvement compared to GLM-4.7 across a wide range of academic benchmarks and achieves best-in-class performance among all open-source models in the world on reasoning, coding, and agentic tasks, closing the gap with frontier models. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 744 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 744 +- **Quantizations:** none +- **Engines**: Transformers +- **Model ID:** zai-org/GLM-5 +- **Model Hubs**: `Hugging Face <https://huggingface.co/zai-org/GLM-5>`__, `ModelScope <https://modelscope.cn/models/ZhipuAI/GLM-5>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name glm-5 --size-in-billions 744 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index ecfa16630d..305b3071b5 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -236,6 +236,11 @@ The following is a list of built-in LLM in Xinference: - 8192 - GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI. + * - :ref:`glm-5 <models_llm_glm-5>` + - chat, vision, tools, reasoning + - 202752 + - We are launching GLM-5, targeting complex systems engineering and long-horizon agentic tasks. Scaling is still one of the most important ways to improve the intelligence efficiency of Artificial General Intelligence (AGI). Compared to GLM-4.5, GLM-5 scales from 355B parameters (32B active) to 744B parameters (40B active), and increases pre-training data from 23T to 28.5T tokens. GLM-5 also integrates DeepSeek Sparse Attention (DSA), largely reducing deployment cost while preserving long-context capacity. Reinforcement learning aims to bridge the gap between competence and excellence in pre-trained models. However, deploying it at scale for LLMs is a challenge due to the RL training inefficiency. 
To this end, we developed slime, a novel asynchronous RL infrastructure that substantially improves training throughput and efficiency, enabling more fine-grained post-training iterations. With advances in both pre-training and post-training, GLM-5 delivers significant improvement compared to GLM-4.7 across a wide range of academic benchmarks and achieves best-in-class performance among all open-source models in the world on reasoning, coding, and agentic tasks, closing the gap with frontier models. + * - :ref:`glm-edge-chat <models_llm_glm-edge-chat>` - chat - 8192 @@ -839,6 +844,8 @@ The following is a list of built-in LLM in Xinference: glm-4v + glm-5 + glm-edge-chat glm4-0414