diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 32f3a5d971..4cbb3edb73 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -20930,6 +20930,126 @@ ] } }, + { + "version": 2, + "context_length": 131072, + "model_name": "glm-4.5v", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "vision", + "reasoning" + ], + "model_description": "GLM-4.5V is based on ZhipuAI’s next-generation flagship text foundation model GLM-4.5-Air (106B parameters, 12B active). It continues the technical approach of GLM-4.1V-Thinking, achieving SOTA performance among models of the same scale on 42 public vision-language benchmarks.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 106, + "activated_size_in_billions": 12, + "model_src": { + "huggingface": { + "quantizations": [ + "none" + ], + "model_id": "zai-org/GLM-4.5V" + }, + "modelscope": { + "quantizations": [ + "none" + ], + "model_id": "ZhipuAI/GLM-4.5V" + } + } + }, + { + "model_format": "fp8", + "model_size_in_billions": 106, + "activated_size_in_billions": 12, + "model_src": { + "huggingface": { + "quantizations": [ + "FP8" + ], + "model_id": "zai-org/GLM-4.5V-FP8" + }, + "modelscope": { + "quantizations": [ + "FP8" + ], + "model_id": "ZhipuAI/GLM-4.5V-FP8" + } + } + }, + { + "model_format": "awq", + "model_size_in_billions": 106, + "activated_size_in_billions": 12, + "model_src": { + "huggingface": { + "quantizations": [ + "Int4" + ], + "model_id": "QuantTrio/GLM-4.5V-AWQ" + }, + "modelscope": { + "quantizations": [ + "Int4" + ], + "model_id": "tclf90/GLM-4.5V-AWQ" + } + } + }, + { + "model_format": "mlx", + "model_size_in_billions": 106, + "activated_size_in_billions": 12, + "model_src": { + "huggingface": { + "quantizations": [ + "3bit", + "4bit", + "5bit", + "6bit", + "8bit" + ], + "model_id": "mlx-community/GLM-4.5V-{quantization}" + }, + "modelscope": { + "quantizations": [ + "3bit", + 
"4bit", + "5bit", + "6bit", + "8bit" + ], + "model_id": "mlx-community/GLM-4.5V-{quantization}" + } + } + } + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# Tools\nYou may call one or more functions to assist with the user query.\nYou are provided with function signatures within XML tags:\n\n{% for tool in tools %}\n{{ tool | tojson(ensure_ascii=False) }}\n{% endfor %}\n\nFor each function call, output the function name and arguments within the following XML format:\n{function-name}\n{arg-key-1}\n{arg-value-1}\n{arg-key-2}\n{arg-value-2}\n...\n{%- endif -%}\n{%- macro visible_text(content) -%}\n {%- if content is string -%}\n {{- content }}\n {%- elif content is iterable and content is not mapping -%}\n {%- for item in content -%}\n {%- if item is mapping and item.type == 'text' -%}\n {{- item.text }}\n {%- elif item is mapping and (item.type == 'image' or 'image' in item) -%}\n <|begin_of_image|><|image|><|end_of_image|>\n {%- elif item is mapping and (item.type == 'video' or 'video' in item) -%}\n <|begin_of_video|><|video|><|end_of_video|>\n {%- elif item is string -%}\n {{- item }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{- content }}\n {%- endif -%}\n{%- endmacro -%}\n{%- set ns = namespace(last_user_index=-1) %}\n{%- for m in messages %}\n {%- if m.role == 'user' %}\n {% set ns.last_user_index = loop.index0 -%}\n {%- endif %}\n{%- endfor %}\n{% for m in messages %}\n{%- if m.role == 'user' -%}<|user|>\n{% if m.content is string %}\n{{ m.content }}\n{%- else %}\n{%- for item in m.content %}\n{% if item.type == 'video' or 'video' in item %}\n<|begin_of_video|><|video|><|end_of_video|>{% elif item.type == 'image' or 'image' in item %}\n<|begin_of_image|><|image|><|end_of_image|>{% elif item.type == 'text' %}\n{{ item.text }}\n{%- endif %}\n{%- endfor %}\n{%- endif %}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith(\"/nothink\")) else '' -}}\n{%- elif m.role == 'assistant' 
-%}\n<|assistant|>\n{%- set reasoning_content = '' %}\n{%- set content = visible_text(m.content) %}\n{%- if m.reasoning_content is string %}\n {%- set reasoning_content = m.reasoning_content %}\n{%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n{%- endif %}\n{%- if loop.index0 > ns.last_user_index and reasoning_content -%}\n{{ '\\n<think>' + reasoning_content.strip() + '</think>'}}\n{%- else -%}\n{{ '\\n<think></think>' }}\n{%- endif -%}\n{%- if content.strip() -%}\n{{ '\\n' + content.strip() }}\n{%- endif -%}\n{% if m.tool_calls %}\n{% for tc in m.tool_calls %}\n{%- if tc.function %}\n {%- set tc = tc.function %}\n{%- endif %}\n{{ '\\n<tool_call>' + tc.name }}\n{% set _args = tc.arguments %}\n{% for k, v in _args.items() %}\n<arg_key>{{ k }}</arg_key>\n<arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>\n{% endfor %}\n</tool_call>{% endfor %}\n{% endif %}\n{%- elif m.role == 'tool' -%}\n{%- if m.content is string -%}\n{%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|observation|>' }}\n{%- endif %}\n{{- '\\n<tool_response>\\n' }}\n{{- m.content }}\n{{- '\\n</tool_response>' }}\n{%- else -%}\n<|observation|>{% for tr in m.content %}\n\n<tool_response>\n{{ tr.output if tr.output is defined else tr }}\n</tool_response>{% endfor -%}\n{% endif -%}\n{%- elif m.role == 'system' -%}\n<|system|>\n{{ visible_text(m.content) }}\n{%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n<|assistant|>\n{{'\\n<think></think>' if (enable_thinking is defined and not enable_thinking) else ''}}\n{%- endif -%}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + "<|observation|>" + ], + "reasoning_start_tag": "<think>", + "reasoning_end_tag": "</think>", + "virtualenv": { + "packages": [ + "transformers>=4.55.0", + "#system_numpy#" + ] + } + }, { "version": 2, "context_length": 131072, diff --git a/xinference/model/llm/transformers/multimodal/glm4_1v.py 
b/xinference/model/llm/transformers/multimodal/glm4_1v.py index 19710def98..43c8269d53 100644 --- a/xinference/model/llm/transformers/multimodal/glm4_1v.py +++ b/xinference/model/llm/transformers/multimodal/glm4_1v.py @@ -28,14 +28,14 @@ @register_transformer -@register_non_default_model("glm-4.1v-thinking") +@register_non_default_model("glm-4.1v-thinking", "glm-4.5v") class Glm4_1VModel(PytorchMultiModalModel): @classmethod def match_json( cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str ) -> bool: family = model_family.model_family or model_family.model_name - if "glm-4.1v" in family.lower(): + if "glm-4.1v" in family.lower() or "glm-4.5v" in family.lower(): return True return False diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 0a51463cc7..29f7e20ef3 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -89,6 +89,7 @@ class VLLMModelConfig(TypedDict, total=False): mm_processor_kwargs: NotRequired[dict[str, Any]] min_pixels: NotRequired[int] max_pixels: NotRequired[int] + enable_expert_parallel: NotRequired[bool] class VLLMGenerateConfig(TypedDict, total=False): @@ -275,6 +276,7 @@ class VLLMGenerateConfig(TypedDict, total=False): if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"): VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5") + VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.5v") if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"): VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")