
Commit 44f04f6

Jun-Howie authored and amumu96 committed
FEAT: support Qwen3 and Qwen3MOE (xorbitsai#3347)
(cherry picked from commit 3b8abb7)
1 parent df73249 commit 44f04f6

File tree

3 files changed: +365 lines, -0 lines

xinference/model/llm/llm_family.json

Lines changed: 173 additions & 0 deletions
```diff
@@ -11403,5 +11403,178 @@
       "<|im_start|>",
       "<|im_end|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen3",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "generate",
+      "chat",
+      "reasoning",
+      "tools"
+    ],
+    "model_description": "Qwen3 is the latest generation of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_6",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-0.6B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "0_6",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-0.6B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-1.7B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "1_7",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-1.7B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-4B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-4B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-8B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-8B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-14B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-14B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 30,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-30B-A3B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 30,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-30B-A3B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-32B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-32B-FP8"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 235,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-235B-A22B"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 235,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-235B-A22B-FP8"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
   }
 ]
```
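
Note: once this family entry is registered, the model becomes addressable by the fields above (name, size, format, quantization). Below is a minimal sketch of launching it through the xinference Python client, assuming a supervisor is already running locally; the endpoint, engine choice, and exact chat signature are illustrative and vary across xinference versions.

```python
# Minimal sketch: launching the qwen3 family registered above via the
# xinference Python client. Assumes a local supervisor is running
# (e.g. started with `xinference-local`); the endpoint is illustrative.
from xinference.client import Client

client = Client("http://localhost:9997")

# model_name / model_size_in_billions / model_format / quantization mirror
# the spec fields above. Fractional sizes such as 0.6B appear in the JSON
# as underscore-encoded strings like "0_6".
model_uid = client.launch_model(
    model_name="qwen3",
    model_engine="transformers",  # or "vLLM" with vllm >= 0.8.5 (see core.py below)
    model_size_in_billions=8,
    model_format="pytorch",
    quantization="none",
)

model = client.get_model(model_uid)
# OpenAI-style messages; the chat signature differs across client versions.
print(model.chat(messages=[{"role": "user", "content": "Hello, who are you?"}]))
```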

xinference/model/llm/llm_family_modelscope.json

Lines changed: 189 additions & 0 deletions
```diff
@@ -9048,5 +9048,194 @@
       "<|im_start|>",
       "<|im_end|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen3",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "generate",
+      "chat",
+      "reasoning",
+      "tools"
+    ],
+    "model_description": "Qwen3 is the latest generation of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_6",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-0.6B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "0_6",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-0.6B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-1.7B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "1_7",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-1.7B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-4B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-4B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-8B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-8B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-14B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-14B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 30,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-30B-A3B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 30,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-30B-A3B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-32B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-32B-FP8",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 235,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen3-235B-A22B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 235,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "Qwen/Qwen3-235B-A22B-FP8",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
   }
 ]
```
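
Both hub entries share the same Jinja2 chat_template string. As a quick sanity check, it can be rendered directly with jinja2 to inspect the ChatML prompt it produces. This is a sketch: the file path and messages are illustrative, and Jinja2 >= 2.9 is assumed for the built-in `tojson` filter used by the tool branch.

```python
# Sketch: render the family's ChatML chat_template with plain Jinja2.
import json
from jinja2 import Template

# Read the template out of the registry file added in this commit.
with open("xinference/model/llm/llm_family.json") as f:
    families = json.load(f)
qwen3 = next(fam for fam in families if fam["model_name"] == "qwen3")

template = Template(qwen3["chat_template"])
prompt = template.render(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    tools=None,                 # tool branch skipped; default system prompt used
    add_generation_prompt=True, # append the open assistant turn
)
print(prompt)
# Expected shape:
# <|im_start|>system
# You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
# <|im_start|>user
# What is the capital of France?<|im_end|>
# <|im_start|>assistant
```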

xinference/model/llm/vllm/core.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -239,6 +239,9 @@ class VLLMGenerateConfig(TypedDict, total=False):
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.4":
     VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
+
 
 class VLLMModel(LLM):
     def __init__(
```
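
One caveat worth flagging: like the existing "0.8.4" gate above it, the new check compares `vllm.__version__` lexicographically as a string, which misorders double-digit components (for example `"0.10.0" < "0.8.5"` as strings). A hedged sketch of a more robust comparison using `packaging` (assumed available; it is a common transitive dependency of vllm and setuptools):

```python
# Sketch: why string comparison of versions is fragile, and a safer form.
from packaging.version import Version

# Lexicographic: '1' < '8' at the third significant character.
assert not ("0.10.0" >= "0.8.5")
# Semantic: release tuple (0, 10, 0) > (0, 8, 5).
assert Version("0.10.0") >= Version("0.8.5")

# A more robust form of the gate added above (illustrative, not the
# code this commit ships):
# if VLLM_INSTALLED and Version(vllm.__version__) >= Version("0.8.5"):
#     VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
```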
