huggingface · Cyrilvallez · Sep 9, 2025 · Aug 26, 2025 · Sep 9, 2025 · Sep 9, 2025
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -657,6 +657,8 @@
         title: Qwen3
       - local: model_doc/qwen3_moe
         title: Qwen3MoE
+      - local: model_doc/qwen3_next
+        title: Qwen3Next
       - local: model_doc/rag
         title: RAG
       - local: model_doc/realm

diff --git a/docs/source/en/model_doc/qwen3_next.md b/docs/source/en/model_doc/qwen3_next.md
@@ -0,0 +1,97 @@
+<!--Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+## Overview
+
+The Qwen3-Next series represents our next-generation foundation models, optimized for extreme context length and large-scale parameter efficiency. 
+The series introduces a suite of architectural innovations designed to maximize performance while minimizing computational cost:
+- **Hybrid Attention**: Replaces standard attention with the combination of **Gated DeltaNet** and **Gated Attention**, enabling efficient context modeling.  
+- **High-Sparsity MoE**: Achieves an extremely low activation ratio of 1:50 in MoE layers — drastically reducing FLOPs per token while preserving model capacity.
+- **Multi-Token Prediction (MTP)**: Boosts pretraining model performance and accelerates inference.
+- **Other Optimizations**: Includes techniques such as **zero-centered and weight-decayed layernorm**, **Gated Attention**, and other stabilizing enhancements for robust training.  
+
+Built on this architecture, we trained and open-sourced Qwen3-Next-80B-A3B — 80B total parameters, only 3B active — achieving extreme sparsity and efficiency.
+
+Despite its ultra-efficiency, it outperforms Qwen3-32B on downstream tasks — while requiring **less than 1/10 of the training cost**. 
+Moreover, it delivers over **10x higher inference throughput** than Qwen3-32B when handling contexts longer than 32K tokens.
+
+For more details, please see the [Qwen3-Next blog post](https://qwenlm.github.io/blog/qwen3_next/).
+## Usage examples
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "Qwen/Qwen3-Next-80B-A3B-Instruct"
+
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    dtype="auto",
+    device_map="auto"
+)
+
+# prepare the model input
+prompt = "Give me a short introduction to large language model."
+messages = [
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+# conduct text completion
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=512
+)
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
+
+content = tokenizer.decode(output_ids, skip_special_tokens=True)
+
+print("content:", content)
+```
+
+## Qwen3NextConfig
+
+[[autodoc]] Qwen3NextConfig
+
+## Qwen3NextModel
+
+[[autodoc]] Qwen3NextModel
+    - forward
+
+## Qwen3NextForCausalLM
+
+[[autodoc]] Qwen3NextForCausalLM
+    - forward
+
+## Qwen3NextForSequenceClassification
+
+[[autodoc]] Qwen3NextForSequenceClassification
+    - forward
+
+## Qwen3NextForQuestionAnswering
+
+[[autodoc]] Qwen3NextForQuestionAnswering
+    - forward
+
+## Qwen3NextForTokenClassification
+
+[[autodoc]] Qwen3NextForTokenClassification
+    - forward
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -276,6 +276,7 @@
     from .qwen2_vl import *
     from .qwen3 import *
     from .qwen3_moe import *
+    from .qwen3_next import *
     from .rag import *
     from .recurrent_gemma import *
     from .reformer import *

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -323,6 +323,7 @@
         ("qwen2_vl_text", "Qwen2VLTextConfig"),
         ("qwen3", "Qwen3Config"),
         ("qwen3_moe", "Qwen3MoeConfig"),
+        ("qwen3_next", "Qwen3NextConfig"),
         ("rag", "RagConfig"),
         ("realm", "RealmConfig"),
         ("recurrent_gemma", "RecurrentGemmaConfig"),
@@ -759,6 +760,7 @@
         ("qwen2_vl_text", "Qwen2VL"),
         ("qwen3", "Qwen3"),
         ("qwen3_moe", "Qwen3MoE"),
+        ("qwen3_next", "Qwen3Next"),
         ("rag", "RAG"),
         ("realm", "REALM"),
         ("recurrent_gemma", "RecurrentGemma"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -317,6 +317,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_vl_text", "Qwen2VLTextModel"),
         ("qwen3", "Qwen3Model"),
         ("qwen3_moe", "Qwen3MoeModel"),
+        ("qwen3_next", "Qwen3NextModel"),
         ("recurrent_gemma", "RecurrentGemmaModel"),
         ("reformer", "ReformerModel"),
         ("regnet", "RegNetModel"),
@@ -713,6 +714,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_moe", "Qwen2MoeForCausalLM"),
         ("qwen3", "Qwen3ForCausalLM"),
         ("qwen3_moe", "Qwen3MoeForCausalLM"),
+        ("qwen3_next", "Qwen3NextForCausalLM"),
         ("recurrent_gemma", "RecurrentGemmaForCausalLM"),
         ("reformer", "ReformerModelWithLMHead"),
         ("rembert", "RemBertForCausalLM"),
@@ -1263,6 +1265,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_moe", "Qwen2MoeForSequenceClassification"),
         ("qwen3", "Qwen3ForSequenceClassification"),
         ("qwen3_moe", "Qwen3MoeForSequenceClassification"),
+        ("qwen3_next", "Qwen3NextForSequenceClassification"),
         ("reformer", "ReformerForSequenceClassification"),
         ("rembert", "RemBertForSequenceClassification"),
         ("roberta", "RobertaForSequenceClassification"),
@@ -1352,6 +1355,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_moe", "Qwen2MoeForQuestionAnswering"),
         ("qwen3", "Qwen3ForQuestionAnswering"),
         ("qwen3_moe", "Qwen3MoeForQuestionAnswering"),
+        ("qwen3_next", "Qwen3NextForQuestionAnswering"),
         ("reformer", "ReformerForQuestionAnswering"),
         ("rembert", "RemBertForQuestionAnswering"),
         ("roberta", "RobertaForQuestionAnswering"),
@@ -1467,6 +1471,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_moe", "Qwen2MoeForTokenClassification"),
         ("qwen3", "Qwen3ForTokenClassification"),
         ("qwen3_moe", "Qwen3MoeForTokenClassification"),
+        ("qwen3_next", "Qwen3NextForTokenClassification"),
         ("rembert", "RemBertForTokenClassification"),
         ("roberta", "RobertaForTokenClassification"),
         ("roberta-prelayernorm", "RobertaPreLayerNormForTokenClassification"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -575,6 +575,13 @@
                 "Qwen2TokenizerFast" if is_tokenizers_available() else None,
             ),
         ),
+        (
+            "qwen3_next",
+            (
+                "Qwen2Tokenizer",
+                "Qwen2TokenizerFast" if is_tokenizers_available() else None,
+            ),
+        ),
         ("rag", ("RagTokenizer", None)),
         ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
         (

diff --git a/src/transformers/models/qwen3_next/__init__.py b/src/transformers/models/qwen3_next/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_qwen3_next import *
+    from .modeling_qwen3_next import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)