Commit 3b8abb7

FEAT: support Qwen3 and Qwen3MOE (#3347)
1 parent 590140b commit 3b8abb7

File tree

3 files changed: +365 -0 lines changed

xinference/model/llm/llm_family.json

Lines changed: 173 additions & 0 deletions
@@ -11411,5 +11411,178 @@
       "<|im_start|>",
       "<|im_end|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen3",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "generate",
+      "chat",
+      "reasoning",
+      "tools"
+    ],
+    "model_description": "Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_6",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-0.6B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "0_6",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-0.6B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-1.7B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "1_7",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-1.7B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-4B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-4B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-8B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-8B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-14B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-14B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 30,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-30B-A3B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 30,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-30B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-32B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-32B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 235,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-235B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 235,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-235B-FP8"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
   }
 ]
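
Note: the chat_template registered above is an ordinary Jinja2 template, so it can be sanity-checked outside of xinference by rendering it directly. A minimal sketch, assuming the repo-relative path below and that plain jinja2 (>= 2.9, which ships the built-in tojson filter the template uses) is a close enough stand-in for whatever sandboxed environment the serving stack actually renders with:

    import json

    from jinja2 import Environment

    # Load the qwen3 family entry added by this commit and pull out its template.
    # Path assumes the script runs from the repository root.
    with open("xinference/model/llm/llm_family.json") as f:
        families = json.load(f)
    template_str = next(
        fam["chat_template"] for fam in families if fam["model_name"] == "qwen3"
    )

    # Render a plain chat turn (no tools, no tool_calls) with the generation
    # prompt appended, mirroring how a serving layer would build the prompt.
    env = Environment()
    prompt = env.from_string(template_str).render(
        messages=[{"role": "user", "content": "What is Qwen3?"}],
        add_generation_prompt=True,
    )
    print(prompt)
    # The output starts with the default Qwen system message and ends with
    # "<|im_start|>assistant\n", matching the stop tokens registered above.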

xinference/model/llm/llm_family_modelscope.json

Lines changed: 189 additions & 0 deletions
@@ -9214,5 +9214,194 @@
       "<|im_start|>",
       "<|im_end|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen3",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "generate",
+      "chat",
+      "reasoning",
+      "tools"
+    ],
+    "model_description": "Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_6",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-0.6B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "0_6",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-0.6B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-1.7B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "1_7",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-1.7B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-4B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-4B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-8B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-8B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-14B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-14B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 30,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-30B-A3B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 30,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-30B-A3B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-32B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-32B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 235,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-235B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 235,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-235B-FP8",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
   }
 ]
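
These entries mirror the Hugging Face specs above but set model_hub to "modelscope". A minimal launch sketch against the Python client follows; the endpoint, the model_engine choice, and the exact chat signature are assumptions that vary between xinference versions, and XINFERENCE_MODEL_SRC=modelscope is assumed to be exported where the worker runs so that these ModelScope specs are the ones used for downloads:

    from xinference.client import Client

    # Assumption: default supervisor endpoint; adjust to your deployment.
    client = Client("http://localhost:9997")

    model_uid = client.launch_model(
        model_name="qwen3",            # family registered in this commit
        model_engine="vllm",           # or "transformers"; vLLM needs >= 0.8.5 (see core.py below)
        model_format="pytorch",
        model_size_in_billions=8,      # any size listed in model_specs
        quantization="none",
    )
    model = client.get_model(model_uid)
    # The chat handle's signature differs across client versions; a
    # messages-style call is shown here.
    print(model.chat(messages=[{"role": "user", "content": "Introduce yourself."}]))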

xinference/model/llm/vllm/core.py

Lines changed: 3 additions & 0 deletions
@@ -240,6 +240,9 @@ class VLLMGenerateConfig(TypedDict, total=False):
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.4":
     VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
+
 
 class VLLMModel(LLM):
     def __init__(

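The new gate follows the same pattern as the glm4-0414 line above it: a lexicographic comparison against vllm.__version__. A hedged alternative sketch using packaging.version, which parses versions instead of comparing raw strings (so e.g. "0.10.0" still counts as newer than "0.8.5"); this is an illustration of the gating idea, not what the commit ships:

    from packaging.version import Version

    try:
        import vllm
        VLLM_INSTALLED = True
    except ImportError:
        VLLM_INSTALLED = False

    VLLM_SUPPORTED_CHAT_MODELS: list[str] = []

    # Same intent as the diff: only advertise qwen3 when the installed vLLM is
    # new enough to serve it, but compare parsed versions rather than strings.
    if VLLM_INSTALLED and Version(vllm.__version__) >= Version("0.8.5"):
        VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")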