diff --git a/.gitignore b/.gitignore
index e38e2dc747..1c15c43954 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,9 +162,6 @@ docker/
# scripts
/scripts/
-# tests
-tests/
-
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be added to the global gitignore or merged into this project gitignore. For a PyCharm
diff --git a/docs/.nav.yml b/docs/.nav.yml
index fb9c7fb443..7bada971c6 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -1,44 +1,46 @@
nav:
- - Home: README.md
- - User Guide:
- - Getting Started:
- - getting_started/quickstart.md
- - getting_started/installation
- - Examples:
- - examples/README.md
- - Offline Inference:
- - Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md
- - Qwen2.5-Image: user_guide/examples/offline_inference/qwen_image.md
- - Online Serving:
- - Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
- - General:
- - usage/*
- - Configuration:
- - configuration/README.md
- - configuration/*
- - Models:
- - models/supported_models.md
- - Developer Guide:
- - General:
- - contributing/README.md
- - glob: contributing/*
- flatten_single_child_sections: true
- - Model Implementation:
- - contributing/model/README.md
- - CI: contributing/ci
- - Design Documents:
- - design/index.md
- - design/architecture_overview.md
- - design/vllm_omni_design.md
- - design/mrs_design.md
- - design/api_design_doc.md
- - Docs Guide: contributing/DOCS_GUIDE.md
- - API Reference:
- - api/README.md
- - api/vllm_omni
- - CLI Reference: cli
- - Community:
- - community/*
- - Slack: https://slack.vllm.ai
- - Blog: https://blog.vllm.ai
- - Forum: https://discuss.vllm.ai
+- Home: README.md
+- User Guide:
+ - Getting Started:
+ - getting_started/quickstart.md
+ - getting_started/installation
+ - Examples:
+ - examples/README.md
+ - Offline Inference:
+ - Offline Example of vLLM-Omni for Qwen2.5-omni: user_guide/examples/offline_inference/qwen2_5_omni.md
+ - Offline Example of vLLM-Omni for Qwen3-omni: user_guide/examples/offline_inference/qwen3_omni.md
+ - Qwen-Image Offline Inference: user_guide/examples/offline_inference/qwen_image.md
+ - Online Serving:
+ - Online serving Example of vLLM-Omni for Qwen2.5-omni: user_guide/examples/online_serving/qwen2_5_omni.md
+ - Online serving Example of vLLM-Omni for Qwen3-omni: user_guide/examples/online_serving/qwen3_omni.md
+ - General:
+ - usage/*
+ - Configuration:
+ - configuration/README.md
+ - configuration/*
+ - Models:
+ - models/supported_models.md
+- Developer Guide:
+ - General:
+ - contributing/README.md
+ - glob: contributing/*
+ flatten_single_child_sections: true
+ - Model Implementation:
+ - contributing/model/README.md
+ - CI: contributing/ci
+ - Design Documents:
+ - design/index.md
+ - design/architecture_overview.md
+ - design/vllm_omni_design.md
+ - design/mrs_design.md
+ - design/api_design_doc.md
+ - Docs Guide: contributing/DOCS_GUIDE.md
+- API Reference:
+ - api/README.md
+ - api/vllm_omni
+- CLI Reference: cli
+- Community:
+ - community/*
+ - Slack: https://slack.vllm.ai
+ - Blog: https://blog.vllm.ai
+ - Forum: https://discuss.vllm.ai
diff --git a/docs/api/README.md b/docs/api/README.md
index 100e17ad44..bb4e9d8777 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -34,6 +34,8 @@ Engine classes for offline and online inference.
- [vllm_omni.diffusion.diffusion_engine.DiffusionEngine][]
- [vllm_omni.engine.AdditionalInformationEntry][]
- [vllm_omni.engine.AdditionalInformationPayload][]
+- [vllm_omni.engine.OmniEngineCoreOutput][]
+- [vllm_omni.engine.OmniEngineCoreOutputs][]
- [vllm_omni.engine.OmniEngineCoreRequest][]
- [vllm_omni.engine.PromptEmbedsPayload][]
- [vllm_omni.engine.arg_utils.AsyncOmniEngineArgs][]
@@ -48,6 +50,7 @@ Core scheduling and caching components.
- [vllm_omni.core.dit_cache_manager.DiTCacheManager][]
- [vllm_omni.core.sched.diffusion_scheduler.DiffusionScheduler][]
+- [vllm_omni.core.sched.generation_scheduler.GenerationScheduler][]
- [vllm_omni.core.sched.output.OmniNewRequestData][]
- [vllm_omni.core.sched.scheduler.OmniScheduler][]
@@ -55,7 +58,7 @@ Core scheduling and caching components.
Model execution components.
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni.OmniOutput][]
+- [vllm_omni.model_executor.models.output_templates.OmniOutput][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni.Qwen2_5OmniForConditionalGeneration][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_talker.Qwen2_5OmniTalkerForConditionalGeneration][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker.Qwen2_5OmniAudioFeatureInputs][]
@@ -72,6 +75,22 @@ Model execution components.
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2EmbeddingModel][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2ForCausalLM][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2Model][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_moe.Qwen3MoeForCausalLM][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni.Qwen3OmniMoeForConditionalGeneration][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_code2wav.Qwen3OmniMoeCode2Wav][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniCodePredictorBaseModel][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniMoeTalkerCodePredictor][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeModel][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeTalkerForConditionalGeneration][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMForCausalLM][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMModel][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeConditionalGenerationMixin][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerForConditionalGeneration][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerMultiModalProcessor][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerProcessingInfo][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3Omni_VisionTransformer][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3_VisionPatchEmbed][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3_VisionPatchMerger][]
## Configuration
@@ -89,4 +108,6 @@ Worker classes and model runners for distributed inference.
- [vllm_omni.worker.gpu_ar_worker.GPUARWorker][]
- [vllm_omni.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][]
- [vllm_omni.worker.gpu_diffusion_worker.GPUDiffusionWorker][]
+- [vllm_omni.worker.gpu_generation_model_runner.GPUGenerationModelRunner][]
+- [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][]
- [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][]
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
new file mode 100644
index 0000000000..46bea983f2
--- /dev/null
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -0,0 +1,335 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+import regex as re
+import yaml
+
+logger = logging.getLogger("mkdocs")
+
+ROOT_DIR = Path(__file__).parent.parent.parent.parent
+ROOT_DIR_RELATIVE = "../../../../.."
+EXAMPLE_DIR = ROOT_DIR / "examples"
+EXAMPLE_DOC_DIR = ROOT_DIR / "docs/user_guide/examples"
+NAV_FILE = ROOT_DIR / "docs/.nav.yml"
+
+
+def fix_case(text: str) -> str:
+ subs = {
+ "api": "API",
+ "cli": "CLI",
+ "cpu": "CPU",
+ "llm": "LLM",
+ "mae": "MAE",
+ "tpu": "TPU",
+ "gguf": "GGUF",
+ "lora": "LoRA",
+ "rlhf": "RLHF",
+ "vllm": "vLLM",
+ "openai": "OpenAI",
+ "lmcache": "LMCache",
+ "multilora": "MultiLoRA",
+ "mlpspeculator": "MLPSpeculator",
+ r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32
+ r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16
+ }
+ for pattern, repl in subs.items():
+ text = re.sub(rf"\b{pattern}\b", repl, text, flags=re.IGNORECASE)
+ return text
+
+
+@dataclass
+class Example:
+ """
+ Example class for generating documentation content from a given path.
+
+ Attributes:
+ path (Path): The path to the main directory or file.
+ category (str): The category of the document.
+ main_file (Path): The main file in the directory.
+ other_files (list[Path]): list of other files in the directory.
+ title (str): The title of the document.
+
+ Methods:
+ __post_init__(): Initializes the main_file, other_files, and title attributes.
+ determine_main_file() -> Path: Determines the main file in the given path.
+ determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
+ determine_title() -> str: Determines the title of the document.
+ generate() -> str: Generates the documentation content.
+ """ # noqa: E501
+
+ path: Path
+ category: str = None
+ main_file: Path = field(init=False)
+ other_files: list[Path] = field(init=False)
+ title: str = field(init=False)
+
+ def __post_init__(self):
+ self.main_file = self.determine_main_file()
+ self.other_files = self.determine_other_files()
+ self.title = self.determine_title()
+
+ @property
+ def is_code(self) -> bool:
+ return self.main_file.suffix != ".md"
+
+ def determine_main_file(self) -> Path:
+ """
+ Determines the main file in the given path.
+ If the path is a file, it returns the path itself. Otherwise, it searches
+ for Markdown files (*.md) in the directory and returns the first one found.
+ Returns:
+ Path: The main file path, either the original path if it's a file or the first
+ Markdown file found in the directory.
+ Raises:
+ IndexError: If no Markdown files are found in the directory.
+ """ # noqa: E501
+ return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
+
+ def determine_other_files(self) -> list[Path]:
+ """
+ Determine other files in the directory excluding the main file.
+
+ This method checks if the given path is a file. If it is, it returns an empty list.
+ Otherwise, it recursively searches through the directory and returns a list of all
+ files that are not the main file.
+
+ Returns:
+ list[Path]: A list of Path objects representing the other files in the directory.
+ """ # noqa: E501
+ if self.path.is_file():
+ return []
+ # Binary file extensions to exclude
+ binary_extensions = {
+ ".wav",
+ ".mp3",
+ ".mp4",
+ ".avi",
+ ".mov",
+ ".mkv", # Audio/Video
+ ".png",
+ ".jpg",
+ ".jpeg",
+ ".gif",
+ ".bmp",
+ ".ico",
+ ".svg", # Images
+ ".pdf",
+ ".zip",
+ ".tar",
+ ".gz",
+ ".bz2",
+ ".xz", # Archives/Documents
+ ".exe",
+ ".so",
+ ".dll",
+ ".dylib", # Binaries
+ ".bin",
+ ".dat",
+ ".db",
+ ".sqlite", # Data files
+ }
+
+ def is_other_file(file: Path) -> bool:
+ return file.is_file() and file != self.main_file and file.suffix.lower() not in binary_extensions
+
+ return [file for file in self.path.rglob("*") if is_other_file(file)]
+
+ def determine_title(self) -> str:
+ if not self.is_code:
+ # Specify encoding for building on Windows
+ with open(self.main_file, encoding="utf-8") as f:
+ first_line = f.readline().strip()
+ match = re.match(r"^#\s+(?P<title>.+)$", first_line)
+ if match:
+ return match.group("title")
+ return fix_case(self.path.stem.replace("_", " ").title())
+
+ def fix_relative_links(self, content: str) -> str:
+ """
+ Fix relative links in markdown content by converting them to gh-file
+ format.
+
+ Args:
+ content (str): The markdown content to process
+
+ Returns:
+ str: Content with relative links converted to gh-file format
+ """
+ # Regex to match markdown links [text](relative_path)
+ # This matches links that don't start with http, https, ftp, or #
+ link_pattern = r"\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)"
+
+ def replace_link(match):
+ link_text = match.group(1)
+ relative_path = match.group(2)
+
+ # Make relative to repo root
+ gh_file = (self.main_file.parent / relative_path).resolve()
+ gh_file = gh_file.relative_to(ROOT_DIR)
+
+ # Make GitHub URL
+ url = "https://github.com/vllm-project/vllm-omni/"
+ url += "tree/main" if self.path.is_dir() else "blob/main"
+ gh_url = f"{url}/{gh_file}"
+
+ return f"[{link_text}]({gh_url})"
+
+ return re.sub(link_pattern, replace_link, content)
+
+ def generate(self) -> str:
+ content = f"# {self.title}\n\n"
+ url = "https://github.com/vllm-project/vllm-omni/"
+ url += "tree/main" if self.path.is_dir() else "blob/main"
+ content += f"Source <{url}/{self.path.relative_to(ROOT_DIR)}>.\n\n"
+
+ # Use long code fence to avoid issues with
+ # included files containing code fences too
+ code_fence = "``````"
+
+ if self.is_code:
+ content += f'{code_fence}{self.main_file.suffix[1:]}\n--8<-- "{self.main_file.relative_to(ROOT_DIR)}"\n{code_fence}\n'
+ else:
+ with open(self.main_file) as f:
+ # Skip the title from md snippets as it's been included above
+ main_content = f.readlines()[1:]
+ content += self.fix_relative_links("".join(main_content))
+ content += "\n"
+
+ if not self.other_files:
+ return content
+
+ content += "## Example materials\n\n"
+ for file in sorted(self.other_files):
+ content += f'??? abstract "{file.relative_to(self.path)}"\n'
+ if file.suffix != ".md":
+ content += f" {code_fence}{file.suffix[1:]}\n"
+ content += f' --8<-- "{file.relative_to(ROOT_DIR)}"\n'
+ if file.suffix != ".md":
+ content += f" {code_fence}\n"
+
+ return content
+
+
+def update_nav_file(examples: list[Example]):
+ """
+ Update the .nav.yml file to include all generated examples.
+ This function completely regenerates the examples section based on the actual
+ folder structure, ensuring consistency between the examples folder and nav file.
+
+ Args:
+ examples: List of Example objects that have been generated
+ """
+ if not NAV_FILE.exists():
+ logger.warning("Navigation file not found: %s", NAV_FILE)
+ return
+
+ # Read the current nav file
+ with open(NAV_FILE, encoding="utf-8") as f:
+ nav_data = yaml.safe_load(f) or {}
+
+ nav_list = nav_data.get("nav", [])
+
+ # Find the "User Guide" section
+ user_guide_idx = None
+ examples_idx = None
+ for i, item in enumerate(nav_list):
+ if isinstance(item, dict) and "User Guide" in item:
+ user_guide_idx = i
+ user_guide_content = item["User Guide"]
+ # Find the "Examples" subsection
+ for j, subitem in enumerate(user_guide_content):
+ if isinstance(subitem, dict) and "Examples" in subitem:
+ examples_idx = j
+ break
+ break
+
+ if user_guide_idx is None or examples_idx is None:
+ logger.warning("Could not find 'User Guide' -> 'Examples' section in nav file")
+ return
+
+ # Get existing Examples section to preserve non-example items (like README.md)
+ existing_examples_content = nav_list[user_guide_idx]["User Guide"][examples_idx]["Examples"]
+
+ # Preserve string items (like "examples/README.md") that are not example categories
+ preserved_items = [
+ item
+ for item in existing_examples_content
+ if isinstance(item, str) and not item.startswith("user_guide/examples/")
+ ]
+
+ # Group examples by category
+ examples_by_category = {}
+ for example in examples:
+ category = example.category
+ if category not in examples_by_category:
+ examples_by_category[category] = []
+ examples_by_category[category].append(example)
+
+ # Build the new Examples section - start with preserved items
+ examples_section = preserved_items.copy()
+
+ # Add examples grouped by category, sorted by category name
+ for category in sorted(examples_by_category.keys()):
+ category_examples = sorted(examples_by_category[category], key=lambda e: e.path.stem)
+ category_items = []
+ for example in category_examples:
+ doc_path = EXAMPLE_DOC_DIR / example.category / f"{example.path.stem}.md"
+ rel_path = doc_path.relative_to(ROOT_DIR / "docs")
+ category_items.append({example.title: str(rel_path)})
+
+ if category_items:
+ # Format category name (e.g., "offline_inference" -> "Offline Inference")
+ category_title = fix_case(category.replace("_", " ").title())
+ examples_section.append({category_title: category_items})
+
+ # Update the nav structure
+ nav_list[user_guide_idx]["User Guide"][examples_idx]["Examples"] = examples_section
+
+ # Write back to file
+ nav_data["nav"] = nav_list
+ with open(NAV_FILE, "w", encoding="utf-8") as f:
+ yaml.dump(nav_data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
+ logger.info("Updated navigation file: %s", NAV_FILE.relative_to(ROOT_DIR))
+
+
+def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
+ logger.info("Generating example documentation")
+ logger.debug("Root directory: %s", ROOT_DIR.resolve())
+ logger.debug("Example directory: %s", EXAMPLE_DIR.resolve())
+ logger.debug("Example document directory: %s", EXAMPLE_DOC_DIR.resolve())
+
+ # Create the EXAMPLE_DOC_DIR if it doesn't exist
+ if not EXAMPLE_DOC_DIR.exists():
+ EXAMPLE_DOC_DIR.mkdir(parents=True)
+
+ categories = sorted(p for p in EXAMPLE_DIR.iterdir() if p.is_dir())
+
+ examples = []
+ glob_patterns = ["*.py", "*.md", "*.sh"]
+ # Find categorised examples
+ for category in categories:
+ globs = [category.glob(pattern) for pattern in glob_patterns]
+ for path in itertools.chain(*globs):
+ examples.append(Example(path, category.stem))
+ # Find examples in subdirectories
+ for path in category.glob("*/*.md"):
+ examples.append(Example(path.parent, category.stem))
+
+ # Generate the example documentation
+ for example in sorted(examples, key=lambda e: e.path.stem):
+ example_name = f"{example.path.stem}.md"
+ doc_path = EXAMPLE_DOC_DIR / example.category / example_name
+ if not doc_path.parent.exists():
+ doc_path.parent.mkdir(parents=True)
+ # Specify encoding for building on Windows
+ with open(doc_path, "w+", encoding="utf-8") as f:
+ f.write(example.generate())
+ logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
+
+ # Update the navigation file
+ update_nav_file(examples)
diff --git a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md
index c11360bcfe..7ec3ed3afe 100644
--- a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md
+++ b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md
@@ -1,55 +1,40 @@
-# Offline Inference Example of vLLM-Omni for Qwen2.5-Omni
+# Offline Example of vLLM-Omni for Qwen2.5-omni
-Source .
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen2_5_omni>.
## đ ī¸ Installation
-Please refer to [installation](../../../getting_started/installation/README.md).
-
-## Run Offline inference with Qwen2.5-Omni.
-First, navigate to the example folder
-```bash
-cd examples/offline_inference/qwen2_5_omni
-```
-Inside the directory, `end2end.py` is a comprehensive demo suite for initializing a model instance of `Qwen/Qwen2.5-Omni-7B` and use it for various offline inference tasks.
-??? abstract "end2end.py"
- ``````py
- --8<-- "examples/offline_inference/qwen2_5_omni/end2end.py"
- ``````
-
-Below we also provide simple bash scripts to execute this file.
-### Single Prompt
-```bash
-bash run_single_prompt.sh
-```
-??? abstract "run_single_prompt.sh"
- ``````sh
- --8<-- "examples/offline_inference/qwen2_5_omni/run_single_prompt.sh"
- ``````
+Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md)
+## Run examples (Qwen2.5-omni)
### Multiple Prompts
-Download dataset from [seed_tts](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit) and extract the prompts with `extract_prompts.py`
-??? abstract "extract_prompts.py"
- ``````py
- --8<-- "examples/offline_inference/qwen2_5_omni/extract_prompts.py"
- ``````
+Download dataset from [seed_tts](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit). To get the prompt, you can:
```bash
tar -xf /seedtts_testset.tar
cp seedtts_testset/en/meta.lst examples/offline_inference/qwen2_5_omni/meta.lst
python3 examples/offline_inference/qwen2_5_omni/extract_prompts.py \
--input examples/offline_inference/qwen2_5_omni/meta.lst \
--output examples/offline_inference/qwen2_5_omni/top100.txt \
- --topk 10
+ --topk 100
+```
+Get into the example folder
+```bash
+cd examples/offline_inference/qwen2_5_omni
```
Then run the command below.
```bash
bash run_multiple_prompts.sh
```
-??? abstract "run_multiple_prompts.sh"
- ``````sh
- --8<-- "examples/offline_inference/qwen2_5_omni/run_multiple_prompts.sh"
- ``````
+### Single Prompt
+Get into the example folder
+```bash
+cd examples/offline_inference/qwen2_5_omni
+```
+Then run the command below.
+```bash
+bash run_single_prompt.sh
+```
### FAQ
@@ -58,3 +43,22 @@ If you encounter error about backend of librosa, try to install ffmpeg with comm
sudo apt update
sudo apt install ffmpeg
```
+
+## Example materials
+
+??? abstract "end2end.py"
+ ``````py
+ --8<-- "examples/offline_inference/qwen2_5_omni/end2end.py"
+ ``````
+??? abstract "extract_prompts.py"
+ ``````py
+ --8<-- "examples/offline_inference/qwen2_5_omni/extract_prompts.py"
+ ``````
+??? abstract "run_multiple_prompts.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen2_5_omni/run_multiple_prompts.sh"
+ ``````
+??? abstract "run_single_prompt.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen2_5_omni/run_single_prompt.sh"
+ ``````
diff --git a/docs/user_guide/examples/offline_inference/qwen3_omni.md b/docs/user_guide/examples/offline_inference/qwen3_omni.md
new file mode 100644
index 0000000000..58ea77b2b3
--- /dev/null
+++ b/docs/user_guide/examples/offline_inference/qwen3_omni.md
@@ -0,0 +1,64 @@
+# Offline Example of vLLM-Omni for Qwen3-omni
+
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen3_omni>.
+
+
+## đ ī¸ Installation
+
+Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md)
+
+## Run examples (Qwen3-omni)
+### Multiple Prompts
+Download the dataset from [seed_tts](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit). For dataset processing, please refer to the [Qwen2.5-omni README.md](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen2_5_omni/README.md)
+Get into the example folder
+```bash
+cd examples/offline_inference/qwen3_omni
+```
+Then run the command below.
+```bash
+bash run_multiple_prompts.sh
+```
+### Single Prompt
+Get into the example folder
+```bash
+cd examples/offline_inference/qwen3_omni
+```
+Then run the command below.
+```bash
+bash run_single_prompt.sh
+```
+If you do not have enough memory, you can run the thinker with tensor parallelism. Just run the command below.
+```bash
+bash run_single_prompt_tp.sh
+```
+
+### FAQ
+
+If you encounter error about backend of librosa, try to install ffmpeg with command below.
+```
+sudo apt update
+sudo apt install ffmpeg
+```
+
+## Example materials
+
+??? abstract "end2end.py"
+ ``````py
+ --8<-- "examples/offline_inference/qwen3_omni/end2end.py"
+ ``````
+??? abstract "qwen3_omni_moe_tp.yaml"
+ ``````yaml
+ --8<-- "examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml"
+ ``````
+??? abstract "run_multiple_prompts.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen3_omni/run_multiple_prompts.sh"
+ ``````
+??? abstract "run_single_prompt.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen3_omni/run_single_prompt.sh"
+ ``````
+??? abstract "run_single_prompt_tp.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh"
+ ``````
diff --git a/docs/user_guide/examples/offline_inference/qwen_image.md b/docs/user_guide/examples/offline_inference/qwen_image.md
index 904a2c0910..ce2c29b2a1 100644
--- a/docs/user_guide/examples/offline_inference/qwen_image.md
+++ b/docs/user_guide/examples/offline_inference/qwen_image.md
@@ -1,25 +1,15 @@
-# Offline Inference Example of vLLM-Omni for Qwen-Image
+# Qwen-Image Offline Inference
-Source .
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen_image>.
-## đ ī¸ Installation
-
-Please refer to [installation](../../../getting_started/installation/README.md).
-
-## Run Offline inference with Qwen-Image
-
-First, navigate to the example folder
-```bash
-cd examples/offline_inference/qwen_image
-```
-
This folder provides two simple entrypoints for experimenting with `Qwen/Qwen-Image` using vLLM-Omni:
- `text_to_image.py`: command-line script for single image generation.
-- `gradio_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration.
+- `web_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration.
-### Command-line Usage
+
+## Local CLI Usage
```bash
python text_to_image.py \
@@ -43,14 +33,9 @@ Key arguments:
- `--height/--width`: output resolution (defaults 1024x1024).
- `--output`: path to save the generated PNG.
-??? abstract "text_to_image.py"
- ``````py
- --8<-- "examples/offline_inference/qwen_image/text_to_image.py"
- ``````
-
> âšī¸ Qwen-Image currently publishes best-effort presets at `1328x1328`, `1664x928`, `928x1664`, `1472x1140`, `1140x1472`, `1584x1056`, and `1056x1584`. Adjust `--height/--width` accordingly for the most reliable outcomes.
-### Web UI Demo
+## Web UI Demo
Launch the gradio demo:
@@ -60,7 +45,13 @@ python gradio_demo.py --port 7862
Then open `http://localhost:7862/` on your local browser to interact with the web UI.
+## Example materials
+
??? abstract "gradio_demo.py"
``````py
- --8<-- "examples/offline_inference/qwen_image/gradio_demo.py"
+ --8<-- "examples/offline_inference/qwen_image/gradio_demo.py"
+ ``````
+??? abstract "text_to_image.py"
+ ``````py
+ --8<-- "examples/offline_inference/qwen_image/text_to_image.py"
``````
diff --git a/docs/user_guide/examples/online_serving/qwen2_5_omni.md b/docs/user_guide/examples/online_serving/qwen2_5_omni.md
index 96c283d168..7918c5f841 100644
--- a/docs/user_guide/examples/online_serving/qwen2_5_omni.md
+++ b/docs/user_guide/examples/online_serving/qwen2_5_omni.md
@@ -1,43 +1,38 @@
-# Online Serving Example of vLLM-Omni for Qwen2.5-Omni
+# Online serving Example of vLLM-Omni for Qwen2.5-omni
-Source .
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/qwen2_5_omni>.
## đ ī¸ Installation
-Please refer to [installation](../../../getting_started/installation/README.md).
+Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/examples/README.md)
-## Deploy Qwen/Qwen2.5-Omni-7B
+## Run examples (Qwen2.5-omni)
-First launch the OpenAI-compatible inference server
+Launch the server
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091
```
+
If you have custom stage configs file, launch the server with command below
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
```
-## Query the model
-Navigate to the example folder
+
+Get into the example folder
```bash
cd examples/online_serving
```
-Query the model server with OpenAI Python API client:
+
+Send request via python
```bash
python openai_chat_completion_client_for_multimodal_generation.py --query-type mixed_modalities
```
-??? abstract "openai_chat_completion_client_for_multimodal_generation.py"
- ``````py
- --8<-- "examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py"
- ``````
-You can also query the model with `curl` command:
+
+Send request via curl
```bash
bash run_curl_multimodal_generation.sh mixed_modalities
```
-??? abstract "run_curl_multimodal_generation.sh"
- ``````py
- --8<-- "examples/online_serving/run_curl_multimodal_generation.sh"
- ``````
### FAQ
@@ -49,16 +44,17 @@ sudo apt install ffmpeg
## Run Local Web UI Demo
-You can also deploy a Gradio Web UI that allows users to interact with the model through a web browser. Below is an example on how to do so with `Qwen/Qwen2.5-Omni-7B`.
+This Web UI demo allows users to interact with the model through a web browser.
### Running Gradio Demo
-Install gradio with `uv pip install "gradio>=5.49.1,<6.0.0"`, then you can launch the web service built on AsyncOmni by
+Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by
```bash
python gradio_demo.py --model Qwen/Qwen2.5-Omni-7B --port 7861
```
-Now you can interact with model via the web UI at `http://localhost:7861/` on your local browser.
+
+Then open `http://localhost:7861/` on your local browser to interact with the web UI.
### Options
@@ -78,3 +74,18 @@ python gradio_demo.py \
- `--port`: Port for the Gradio server (default `7861`).
- `--stage-configs-path`: Optional path to custom stage configs YAML.
- `--share`: Set to expose a temporary public link via Gradio.
+
+## Example materials
+
+??? abstract "gradio_demo.py"
+ ``````py
+ --8<-- "examples/online_serving/qwen2_5_omni/gradio_demo.py"
+ ``````
+??? abstract "openai_chat_completion_client_for_multimodal_generation.py"
+ ``````py
+ --8<-- "examples/online_serving/qwen2_5_omni/openai_chat_completion_client_for_multimodal_generation.py"
+ ``````
+??? abstract "run_curl_multimodal_generation.sh"
+ ``````sh
+ --8<-- "examples/online_serving/qwen2_5_omni/run_curl_multimodal_generation.sh"
+ ``````
diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md
new file mode 100644
index 0000000000..3d1ba770bc
--- /dev/null
+++ b/docs/user_guide/examples/online_serving/qwen3_omni.md
@@ -0,0 +1,95 @@
+# Online serving Example of vLLM-Omni for Qwen3-omni
+
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/qwen3_omni>.
+
+
+## đ ī¸ Installation
+
+Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/examples/README.md)
+
+## Run examples (Qwen3-Omni)
+
+Launch the server
+```bash
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
+```
+
+If you have custom stage configs file, launch the server with command below
+```bash
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
+```
+
+Get into the example folder
+```bash
+cd examples/online_serving
+```
+
+Send request via python
+```bash
+python openai_chat_completion_client_for_multimodal_generation.py --query-type mixed_modalities
+```
+
+Send request via curl
+```bash
+bash run_curl_multimodal_generation.sh mixed_modalities
+```
+
+### FAQ
+
+If you encounter error about backend of librosa, try to install ffmpeg with command below.
+```
+sudo apt update
+sudo apt install ffmpeg
+```
+
+## Run Local Web UI Demo
+
+This Web UI demo allows users to interact with the model through a web browser.
+
+### Running Gradio Demo
+
+Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by
+
+```bash
+python gradio_demo.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 7861
+```
+
+Then open `http://localhost:7861/` on your local browser to interact with the web UI.
+
+
+### Options
+
+The gradio demo also supports running with an existing API server and can be customized with the following arguments.
+
+
+```bash
+python gradio_demo.py \
+ --model Qwen/Qwen3-Omni-30B-A3B-Instruct \
+ --use-api-server \
+ --api-base http://localhost:8091/v1 \
+ --ip 127.0.0.1 \
+ --port 7861
+```
+
+- `--model`: Model name
+- `--use-api-server`: If set, connect to an existing vLLM HTTP API server instead of running AsyncOmniLLM locally.
+- `--api-base`: Base URL for vllm serve (only used when `use-api-server` is set, default: http://localhost:8091/v1)
+- `--ip`: Host/IP for Gradio server (default: 127.0.0.1)
+- `--port`: Port for Gradio server (default: 7861)
+- `--stage-configs-path`: Path to custom stage configs YAML file (optional)
+- `--share`: Share the Gradio demo publicly (creates a public link)
+
+## Example materials
+
+??? abstract "gradio_demo.py"
+ ``````py
+ --8<-- "examples/online_serving/qwen3_omni/gradio_demo.py"
+ ``````
+??? abstract "openai_chat_completion_client_for_multimodal_generation.py"
+ ``````py
+ --8<-- "examples/online_serving/qwen3_omni/openai_chat_completion_client_for_multimodal_generation.py"
+ ``````
+??? abstract "run_curl_multimodal_generation.sh"
+ ``````sh
+ --8<-- "examples/online_serving/qwen3_omni/run_curl_multimodal_generation.sh"
+ ``````
diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md
index afdaee13ea..4c52925991 100644
--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
@@ -23,7 +23,7 @@ Then run the command below.
```bash
bash run_multiple_prompts.sh
```
-### Single Prompts
+### Single Prompt
Get into the example folder
```bash
cd examples/offline_inference/qwen2_5_omni
diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py
index c5be18a94a..463db0aa89 100644
--- a/examples/offline_inference/qwen2_5_omni/end2end.py
+++ b/examples/offline_inference/qwen2_5_omni/end2end.py
@@ -16,7 +16,7 @@
from vllm.sampling_params import SamplingParams
from vllm.utils import FlexibleArgumentParser
-from vllm_omni.entrypoints.omni_llm import OmniLLM
+from vllm_omni.entrypoints.omni import Omni
SEED = 42
@@ -140,7 +140,7 @@ def main(args):
model_name = "Qwen/Qwen2.5-Omni-7B"
query_result = query_map[args.query_type]()
- omni_llm = OmniLLM(
+ omni_llm = Omni(
model=model_name,
log_stats=args.enable_stats,
log_file=("omni_llm_pipeline.log" if args.enable_stats else None),
diff --git a/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh b/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh
index 5b3c19cdc2..2f2b3ae756 100644
--- a/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh
+++ b/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh
@@ -1,2 +1,2 @@
python end2end.py --output-wav output_audio \
- --query-type use_audio_in_video
+ --query-type text
diff --git a/examples/offline_inference/qwen3_omni/README.md b/examples/offline_inference/qwen3_omni/README.md
index 766453667f..22c7651632 100644
--- a/examples/offline_inference/qwen3_omni/README.md
+++ b/examples/offline_inference/qwen3_omni/README.md
@@ -1,4 +1,4 @@
-# Offline Example of vLLM-Omni for Qwen2.5-omni
+# Offline Example of vLLM-Omni for Qwen3-omni
## đ ī¸ Installation
@@ -15,7 +15,7 @@ Then run the command below.
```bash
bash run_multiple_prompts.sh
```
-### Single Prompts
+### Single Prompt
Get into the example folder
```bash
cd examples/offline_inference/qwen3_omni
@@ -24,6 +24,10 @@ Then run the command below.
```bash
bash run_single_prompt.sh
```
+If you do not have enough memory, you can run the thinker stage with tensor parallelism. Just run the command below.
+```bash
+bash run_single_prompt_tp.sh
+```
### FAQ
diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py
index c1fa21a621..54f74c6097 100644
--- a/examples/offline_inference/qwen3_omni/end2end.py
+++ b/examples/offline_inference/qwen3_omni/end2end.py
@@ -16,7 +16,7 @@
from vllm.multimodal.image import convert_image_mode
from vllm.utils import FlexibleArgumentParser
-from vllm_omni.entrypoints.omni_llm import OmniLLM
+from vllm_omni.entrypoints.omni import Omni
SEED = 42
@@ -127,8 +127,9 @@ def main(args):
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
query_result = query_map[args.query_type]()
- omni_llm = OmniLLM(
+ omni_llm = Omni(
model=model_name,
+ stage_configs_path=args.stage_configs_path,
)
thinker_sampling_params = SamplingParams(
@@ -276,6 +277,12 @@ def parse_args():
default=None,
help="Path to a .txt file with one prompt per line (preferred).",
)
+ parser.add_argument(
+ "--stage-configs-path",
+ type=str,
+ default=None,
+ help="Path to a stage configs file.",
+ )
return parser.parse_args()
@@ -283,7 +290,3 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
main(args)
-
- # use examples:
- # python end2end.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --query-type text
- # python end2end.py --model /custom_path/Qwen3-Omni-30B-A3B-Instruct --query-type use_video_only
diff --git a/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml b/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml
new file mode 100644
index 0000000000..8d9c2900e6
--- /dev/null
+++ b/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml
@@ -0,0 +1,94 @@
+# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
+# Stage 0: Thinker (multimodal understanding + text generation)
+# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes)
+# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
+
+stage_args:
+ - stage_id: 0
+ runtime:
+ devices: "0,1"
+ max_batch_size: 1
+ engine_args:
+ model_stage: thinker
+ model_arch: Qwen3OmniMoeForConditionalGeneration
+ worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+ scheduler_cls: vllm_omni.core.sched.scheduler.OmniScheduler
+ gpu_memory_utilization: 0.8
+ enforce_eager: true
+ trust_remote_code: true
+ engine_output_type: latent # Output hidden states for talker
+ distributed_executor_backend: "mp"
+ enable_prefix_caching: false
+ hf_config_name: thinker_config
+ tensor_parallel_size: 2
+ final_output: true
+ final_output_type: text
+ is_comprehension: true
+ default_sampling_params:
+ temperature: 0.4
+ top_p: 0.9
+ top_k: 1
+ max_tokens: 2048
+ seed: 42
+ detokenize: True
+ repetition_penalty: 1.05
+
+ - stage_id: 1
+ runtime:
+ devices: "1"
+ max_batch_size: 1
+ engine_args:
+ model_stage: talker
+ model_arch: Qwen3OmniMoeForConditionalGeneration
+ worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+ scheduler_cls: vllm_omni.core.sched.scheduler.OmniScheduler
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ trust_remote_code: true
+ engine_output_type: latent # Output codec codes for code2wav
+ # tensor_parallel_size: 2
+ enable_prefix_caching: false
+ distributed_executor_backend: "mp"
+ hf_config_name: talker_config
+ engine_input_source: [0]
+ custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
+ # final_output: true
+ # final_output_type: text
+ default_sampling_params:
+ temperature: 0.9
+ top_k: 50
+ max_tokens: 4096
+ seed: 42
+ detokenize: False
+ repetition_penalty: 1.05
+ stop_token_ids: [2150]
+
+ - stage_id: 2
+ runtime:
+ devices: "1"
+ max_batch_size: 1
+ engine_args:
+ model_stage: code2wav
+ model_arch: Qwen3OmniMoeForConditionalGeneration
+ worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+ scheduler_cls: vllm_omni.core.sched.generation_scheduler.GenerationScheduler
+ enforce_eager: true
+ trust_remote_code: true
+ enable_prefix_caching: false
+ engine_output_type: audio # Final output: audio waveform
+ gpu_memory_utilization: 0.1
+ distributed_executor_backend: "mp"
+ max_num_batched_tokens: 1000000
+ hf_config_name: thinker_config
+ engine_input_source: [1]
+ custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
+ final_output: true
+ final_output_type: audio
+ default_sampling_params:
+ temperature: 0.0
+ top_p: 1.0
+ top_k: -1
+ max_tokens: 65536
+ seed: 42
+ detokenize: True
+ repetition_penalty: 1.1
diff --git a/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh b/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh
new file mode 100644
index 0000000000..eb25b6bc05
--- /dev/null
+++ b/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh
@@ -0,0 +1,6 @@
+python end2end.py --output-wav output_audio \
+ --query-type use_audio \
+ --init-sleep-seconds 90 \
+ --stage-configs-path qwen3_omni_moe_tp.yaml
+
+# --init-sleep-seconds staggers startup so that two vLLM stages sharing a card are not initialized at the same time.
diff --git a/examples/online_serving/qwen3_omni/README.md b/examples/online_serving/qwen3_omni/README.md
index 01a0fdd286..6e4b27ea38 100644
--- a/examples/online_serving/qwen3_omni/README.md
+++ b/examples/online_serving/qwen3_omni/README.md
@@ -1,4 +1,4 @@
-# Online serving Example of vLLM-omni for Qwen2.5-omni
+# Online serving Example of vLLM-Omni for Qwen3-omni
## đ ī¸ Installation
diff --git a/mkdocs.yml b/mkdocs.yml
index dfa29f59bd..31ca350fa4 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -58,6 +58,7 @@ theme:
hooks:
- docs/mkdocs/hooks/generate_api_readme.py
- docs/mkdocs/hooks/url_schemes.py
+ - docs/mkdocs/hooks/generate_examples.py
# Plugins
plugins:
diff --git a/vllm_omni/__init__.py b/vllm_omni/__init__.py
index 89c7cbfc6c..fd7f9174fb 100644
--- a/vllm_omni/__init__.py
+++ b/vllm_omni/__init__.py
@@ -24,7 +24,7 @@
from .entrypoints.async_omni_llm import AsyncOmniLLM
# Main entry points
-# from .entrypoints.omni import Omni
+from .entrypoints.omni import Omni
from .version import __version__, __version_tuple__ # isort:skip
@@ -33,7 +33,7 @@
"__version__",
"__version_tuple__",
# Main components
- # "Omni",
+ "Omni",
"AsyncOmniLLM",
# Configuration
"OmniModelConfig",
diff --git a/vllm_omni/diffusion/models/__init__.py b/vllm_omni/diffusion/models/__init__.py
new file mode 100644
index 0000000000..9e7471a2da
--- /dev/null
+++ b/vllm_omni/diffusion/models/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Diffusion model implementations."""
diff --git a/vllm_omni/diffusion/models/qwen_image/__init__.py b/vllm_omni/diffusion/models/qwen_image/__init__.py
index 69d6821f4f..84fa2259d4 100644
--- a/vllm_omni/diffusion/models/qwen_image/__init__.py
+++ b/vllm_omni/diffusion/models/qwen_image/__init__.py
@@ -1,3 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Qwen Image model components."""
+"""Qwen Image diffusion model components."""
+
+from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image import (
+ QwenImagePipeline,
+ get_qwen_image_post_process_func,
+)
+from vllm_omni.diffusion.models.qwen_image.qwen_image_transformer import (
+ QwenImageTransformer2DModel,
+)
+
+__all__ = [
+ "QwenImagePipeline",
+ "QwenImageTransformer2DModel",
+ "get_qwen_image_post_process_func",
+]
diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py
index cbedc5c317..69917361a1 100644
--- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py
+++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py
@@ -392,8 +392,6 @@ def encode_prompt(
Args:
prompt (`str` or `list[str]`, *optional*):
prompt to be encoded
- device: (`torch.device`):
- torch device
num_images_per_prompt (`int`):
number of images that should be generated per prompt
prompt_embeds (`torch.Tensor`, *optional*):
diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
index c0ef5fd123..b2f4e35bdb 100644
--- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
+++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
@@ -113,10 +113,12 @@ def __init__(self, theta: int, axes_dim: list[int], scale_rope=False):
# DO NOT USING REGISTER BUFFER HERE, IT WILL CAUSE COMPLEX NUMBERS LOSE ITS IMAGINARY PART
self.scale_rope = scale_rope
- def rope_params(self, index, dim, theta=10000):
+ def rope_params(self, index: torch.Tensor, dim: int, theta: int = 10000):
"""
Args:
- index: [0, 1, 2, 3] 1D Tensor representing the position index of the token
+ index (`torch.Tensor`): [0, 1, 2, 3] 1D Tensor representing the position index of the token
+ dim (`int`): Dimension for the rope parameters
+ theta (`int`): Theta parameter for rope
"""
assert dim % 2 == 0
freqs = torch.outer(
diff --git a/vllm_omni/diffusion/worker/__init__.py b/vllm_omni/diffusion/worker/__init__.py
index 068a25f8f9..dc3306dae3 100644
--- a/vllm_omni/diffusion/worker/__init__.py
+++ b/vllm_omni/diffusion/worker/__init__.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Diffusion worker components."""
+"""Worker classes for diffusion models."""
from vllm_omni.diffusion.worker.gpu_worker import GPUWorker, WorkerProc
diff --git a/vllm_omni/entrypoints/async_omni_llm.py b/vllm_omni/entrypoints/async_omni_llm.py
index 5046196d05..c45acdc44a 100644
--- a/vllm_omni/entrypoints/async_omni_llm.py
+++ b/vllm_omni/entrypoints/async_omni_llm.py
@@ -94,7 +94,7 @@ def __init__(
shm_threshold_bytes: int = 65536,
batch_timeout: int = 10,
init_timeout: int = 60000,
- **kwargs,
+ **kwargs: Any,
):
self.batch_timeout = batch_timeout
self._enable_stats: bool = bool(log_stats)
diff --git a/vllm_omni/entrypoints/omni_llm.py b/vllm_omni/entrypoints/omni_llm.py
index 4b9b1b4f64..94d93ef767 100644
--- a/vllm_omni/entrypoints/omni_llm.py
+++ b/vllm_omni/entrypoints/omni_llm.py
@@ -534,8 +534,9 @@ def __init__(
self,
model: str,
compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None,
+ hf_overrides: Optional[dict[str, Any]] = None,
structured_outputs_config: Optional[Union[dict[str, Any], StructuredOutputsConfig]] = None,
- **kwargs,
+ **kwargs: Any,
):
"""LLM constructor."""
if "disable_log_stats" not in kwargs:
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py
index 49215be36d..5c5c5b15d1 100644
--- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py
+++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py
@@ -6,6 +6,7 @@
autoregressively, predicting layers 1 to N based on layer-0 codes from the talker.
"""
+from collections import namedtuple
from typing import Any, Optional
import torch
@@ -164,21 +165,48 @@ def forward(
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
- attention_interface = ALL_ATTENTION_FUNCTIONS["flash_attention_2"]
- attn_output, _ = attention_interface(
- self,
- q_heads,
- k_heads,
- v_heads,
- None,
- dropout=0.0 if not self.training else self.attention_dropout,
- scaling=self.head_dim**-0.5,
- sliding_window=None,
- use_cache=use_cache,
- position_ids=position_ids,
- output_hidden_states=True,
- output_attentions=False,
- )
+ # Try attention backends in order of preference, with runtime error handling
+ # This handles cases where the backend is registered but not actually available
+ attention_backends = ["flash_attention_2", "xformers", "eager", "sdpa"]
+ attn_output = None
+ last_error = None
+
+ for backend_name in attention_backends:
+ if backend_name not in ALL_ATTENTION_FUNCTIONS:
+ continue
+
+ try:
+ attention_interface = ALL_ATTENTION_FUNCTIONS[backend_name]
+ attn_output, _ = attention_interface(
+ self,
+ q_heads,
+ k_heads,
+ v_heads,
+ None,
+ dropout=0.0 if not self.training else getattr(self, "attention_dropout", 0.0),
+ scaling=self.head_dim**-0.5,
+ sliding_window=None,
+ use_cache=use_cache,
+ position_ids=position_ids,
+ output_hidden_states=True,
+ output_attentions=False,
+ )
+ # Success - log fallback if not using flash_attention_2
+ if backend_name != "flash_attention_2":
+ logger.warning_once(
+ f"Using {backend_name} attention backend (flash_attention_2 not available or failed)"
+ )
+ break
+ except (ValueError, ImportError, RuntimeError, AttributeError) as e:
+ # Store error and try next backend
+ last_error = e
+ continue
+
+ if attn_output is None:
+ raise RuntimeError(
+ f"All attention backends failed. Last error: {last_error}. "
+ "Please install flash-attn, or ensure PyTorch's scaled_dot_product_attention is available."
+ )
attn_output = attn_output.reshape(*(hidden_states.shape[:-1]), -1).contiguous()
attn_output = self.o_proj(attn_output)
@@ -373,16 +401,22 @@ def forward(
past_key_values: Optional[Any] = None,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
- **kwargs,
- ):
+ **kwargs: Any,
+ ) -> Any:
"""
Forward pass matching HF structure.
Args:
inputs_embeds: [batch, seq_len, hidden_size]
+ attention_mask: Optional attention mask tensor
+ position_ids: Optional position IDs tensor
+ past_key_values: Optional cached key-value pairs
+ use_cache: Whether to use cache
+ cache_position: Optional cache position tensor
+ **kwargs: Additional keyword arguments
Returns:
- Object with .last_hidden_state attribute
+ Named tuple with .last_hidden_state and .past_key_values attributes
"""
batch_size, seq_len, _ = inputs_embeds.shape
@@ -421,8 +455,6 @@ def forward(
hidden_states = self.norm(hidden_states)
# Return in HF-compatible format
- from collections import namedtuple
-
Output = namedtuple("Output", ["last_hidden_state", "past_key_values"])
return Output(last_hidden_state=hidden_states, past_key_values=None) # [batch, num_code_groups-1, hidden_size]
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py
index 34e05925f9..91231d0fbc 100644
--- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py
+++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py
@@ -1,4 +1,5 @@
from collections.abc import Iterable
+from typing import Any
import torch
import torch.nn as nn
@@ -284,7 +285,7 @@ def code_predictor_forward(
return result_codes, summed_embeddings
- def init_multi_modal(self, thinker_config):
+ def init_multi_modal(self, thinker_config: Any) -> None:
"""
Initialize multimodal components from the thinker.
diff --git a/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py b/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py
index e2d07aa1bb..b3b5500ef1 100644
--- a/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py
+++ b/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py
@@ -3,7 +3,7 @@
# Copyright 2025 The Qwen team.
"""Stage input processor for Qwen3 Omni MoE: Thinker â Talker transition."""
-from typing import Union
+from typing import Any, Union
import torch
from vllm.inputs import TextPrompt
@@ -47,11 +47,11 @@ def _compute_talker_prompt_ids_length(info):
def thinker2talker(
- stage_list,
- engine_input_source,
- prompt: Union[OmniTokensPrompt, TextPrompt] = None,
+ stage_list: list[Any],
+ engine_input_source: list[int],
+ prompt: Union[OmniTokensPrompt, TextPrompt, None] = None,
requires_multimodal_data: bool = False,
-):
+) -> list[OmniTokensPrompt]:
"""
Process thinker outputs to create talker inputs.
@@ -64,6 +64,7 @@ def thinker2talker(
stage_list: List of stage objects
engine_input_source: Source stage IDs (typically [0] for thinker)
prompt: Original prompt data
+ requires_multimodal_data: Whether multimodal data is required
Returns:
List of OmniTokensPrompt for talker stage
@@ -111,11 +112,11 @@ def thinker2talker(
def talker2code2wav(
- stage_list,
- engine_input_source,
- prompt: Union[OmniTokensPrompt, TextPrompt] = None,
+ stage_list: list[Any],
+ engine_input_source: list[int],
+ prompt: Union[OmniTokensPrompt, TextPrompt, None] = None,
requires_multimodal_data: bool = False,
-):
+) -> list[OmniTokensPrompt]:
"""
Process talker outputs to create code2wav inputs.
@@ -128,6 +129,7 @@ def talker2code2wav(
stage_list: List of stage objects
engine_input_source: Source stage IDs (typically [1] for talker)
prompt: Original prompt data
+ requires_multimodal_data: Whether multimodal data is required
Returns:
List of OmniTokensPrompt for code2wav stage