diff --git a/.gitignore b/.gitignore
index e38e2dc747..1c15c43954 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,9 +162,6 @@ docker/
# scripts
/scripts/
-# tests
-tests/
-
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be added to the global gitignore or merged into this project gitignore. For a PyCharm
diff --git a/docs/.nav.yml b/docs/.nav.yml
index fb9c7fb443..7bada971c6 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -1,44 +1,46 @@
nav:
- - Home: README.md
- - User Guide:
- - Getting Started:
- - getting_started/quickstart.md
- - getting_started/installation
- - Examples:
- - examples/README.md
- - Offline Inference:
- - Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md
- - Qwen2.5-Image: user_guide/examples/offline_inference/qwen_image.md
- - Online Serving:
- - Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
- - General:
- - usage/*
- - Configuration:
- - configuration/README.md
- - configuration/*
- - Models:
- - models/supported_models.md
- - Developer Guide:
- - General:
- - contributing/README.md
- - glob: contributing/*
- flatten_single_child_sections: true
- - Model Implementation:
- - contributing/model/README.md
- - CI: contributing/ci
- - Design Documents:
- - design/index.md
- - design/architecture_overview.md
- - design/vllm_omni_design.md
- - design/mrs_design.md
- - design/api_design_doc.md
- - Docs Guide: contributing/DOCS_GUIDE.md
- - API Reference:
- - api/README.md
- - api/vllm_omni
- - CLI Reference: cli
- - Community:
- - community/*
- - Slack: https://slack.vllm.ai
- - Blog: https://blog.vllm.ai
- - Forum: https://discuss.vllm.ai
+- Home: README.md
+- User Guide:
+ - Getting Started:
+ - getting_started/quickstart.md
+ - getting_started/installation
+ - Examples:
+ - examples/README.md
+ - Offline Inference:
+ - Offline Example of vLLM-Omni for Qwen2.5-omni: user_guide/examples/offline_inference/qwen2_5_omni.md
+ - Offline Example of vLLM-Omni for Qwen3-omni: user_guide/examples/offline_inference/qwen3_omni.md
+ - Qwen-Image Offline Inference: user_guide/examples/offline_inference/qwen_image.md
+ - Online Serving:
+ - Online serving Example of vLLM-Omni for Qwen2.5-omni: user_guide/examples/online_serving/qwen2_5_omni.md
+ - Online serving Example of vLLM-Omni for Qwen3-omni: user_guide/examples/online_serving/qwen3_omni.md
+ - General:
+ - usage/*
+ - Configuration:
+ - configuration/README.md
+ - configuration/*
+ - Models:
+ - models/supported_models.md
+- Developer Guide:
+ - General:
+ - contributing/README.md
+ - glob: contributing/*
+ flatten_single_child_sections: true
+ - Model Implementation:
+ - contributing/model/README.md
+ - CI: contributing/ci
+ - Design Documents:
+ - design/index.md
+ - design/architecture_overview.md
+ - design/vllm_omni_design.md
+ - design/mrs_design.md
+ - design/api_design_doc.md
+ - Docs Guide: contributing/DOCS_GUIDE.md
+- API Reference:
+ - api/README.md
+ - api/vllm_omni
+- CLI Reference: cli
+- Community:
+ - community/*
+ - Slack: https://slack.vllm.ai
+ - Blog: https://blog.vllm.ai
+ - Forum: https://discuss.vllm.ai
diff --git a/docs/api/README.md b/docs/api/README.md
index 100e17ad44..bb4e9d8777 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -34,6 +34,8 @@ Engine classes for offline and online inference.
- [vllm_omni.diffusion.diffusion_engine.DiffusionEngine][]
- [vllm_omni.engine.AdditionalInformationEntry][]
- [vllm_omni.engine.AdditionalInformationPayload][]
+- [vllm_omni.engine.OmniEngineCoreOutput][]
+- [vllm_omni.engine.OmniEngineCoreOutputs][]
- [vllm_omni.engine.OmniEngineCoreRequest][]
- [vllm_omni.engine.PromptEmbedsPayload][]
- [vllm_omni.engine.arg_utils.AsyncOmniEngineArgs][]
@@ -48,6 +50,7 @@ Core scheduling and caching components.
- [vllm_omni.core.dit_cache_manager.DiTCacheManager][]
- [vllm_omni.core.sched.diffusion_scheduler.DiffusionScheduler][]
+- [vllm_omni.core.sched.generation_scheduler.GenerationScheduler][]
- [vllm_omni.core.sched.output.OmniNewRequestData][]
- [vllm_omni.core.sched.scheduler.OmniScheduler][]
@@ -55,7 +58,7 @@ Core scheduling and caching components.
Model execution components.
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni.OmniOutput][]
+- [vllm_omni.model_executor.models.output_templates.OmniOutput][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni.Qwen2_5OmniForConditionalGeneration][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_talker.Qwen2_5OmniTalkerForConditionalGeneration][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker.Qwen2_5OmniAudioFeatureInputs][]
@@ -72,6 +75,22 @@ Model execution components.
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2EmbeddingModel][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2ForCausalLM][]
- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2Model][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_moe.Qwen3MoeForCausalLM][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni.Qwen3OmniMoeForConditionalGeneration][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_code2wav.Qwen3OmniMoeCode2Wav][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniCodePredictorBaseModel][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniMoeTalkerCodePredictor][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeModel][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeTalkerForConditionalGeneration][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMForCausalLM][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMModel][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeConditionalGenerationMixin][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerForConditionalGeneration][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerMultiModalProcessor][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerProcessingInfo][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3Omni_VisionTransformer][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3_VisionPatchEmbed][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3_VisionPatchMerger][]
## Configuration
@@ -89,4 +108,6 @@ Worker classes and model runners for distributed inference.
- [vllm_omni.worker.gpu_ar_worker.GPUARWorker][]
- [vllm_omni.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][]
- [vllm_omni.worker.gpu_diffusion_worker.GPUDiffusionWorker][]
+- [vllm_omni.worker.gpu_generation_model_runner.GPUGenerationModelRunner][]
+- [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][]
- [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][]
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
new file mode 100644
index 0000000000..46bea983f2
--- /dev/null
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -0,0 +1,335 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+import regex as re
+import yaml
+
+logger = logging.getLogger("mkdocs")
+
+ROOT_DIR = Path(__file__).parent.parent.parent.parent
+ROOT_DIR_RELATIVE = "../../../../.."
+EXAMPLE_DIR = ROOT_DIR / "examples"
+EXAMPLE_DOC_DIR = ROOT_DIR / "docs/user_guide/examples"
+NAV_FILE = ROOT_DIR / "docs/.nav.yml"
+
+
+def fix_case(text: str) -> str:
+ subs = {
+ "api": "API",
+ "cli": "CLI",
+ "cpu": "CPU",
+ "llm": "LLM",
+ "mae": "MAE",
+ "tpu": "TPU",
+ "gguf": "GGUF",
+ "lora": "LoRA",
+ "rlhf": "RLHF",
+ "vllm": "vLLM",
+ "openai": "OpenAI",
+ "lmcache": "LMCache",
+ "multilora": "MultiLoRA",
+ "mlpspeculator": "MLPSpeculator",
+ r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32
+ r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16
+ }
+ for pattern, repl in subs.items():
+ text = re.sub(rf"\b{pattern}\b", repl, text, flags=re.IGNORECASE)
+ return text
+
+
+@dataclass
+class Example:
+ """
+ Example class for generating documentation content from a given path.
+
+ Attributes:
+ path (Path): The path to the main directory or file.
+ category (str): The category of the document.
+ main_file (Path): The main file in the directory.
+ other_files (list[Path]): list of other files in the directory.
+ title (str): The title of the document.
+
+ Methods:
+ __post_init__(): Initializes the main_file, other_files, and title attributes.
+ determine_main_file() -> Path: Determines the main file in the given path.
+ determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
+ determine_title() -> str: Determines the title of the document.
+ generate() -> str: Generates the documentation content.
+ """ # noqa: E501
+
+ path: Path
+ category: str = None
+ main_file: Path = field(init=False)
+ other_files: list[Path] = field(init=False)
+ title: str = field(init=False)
+
+ def __post_init__(self):
+ self.main_file = self.determine_main_file()
+ self.other_files = self.determine_other_files()
+ self.title = self.determine_title()
+
+ @property
+ def is_code(self) -> bool:
+ return self.main_file.suffix != ".md"
+
+ def determine_main_file(self) -> Path:
+ """
+ Determines the main file in the given path.
+ If the path is a file, it returns the path itself. Otherwise, it searches
+ for Markdown files (*.md) in the directory and returns the first one found.
+ Returns:
+ Path: The main file path, either the original path if it's a file or the first
+ Markdown file found in the directory.
+ Raises:
+ IndexError: If no Markdown files are found in the directory.
+ """ # noqa: E501
+ return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
+
+ def determine_other_files(self) -> list[Path]:
+ """
+ Determine other files in the directory excluding the main file.
+
+ This method checks if the given path is a file. If it is, it returns an empty list.
+ Otherwise, it recursively searches through the directory and returns a list of all
+ files that are not the main file.
+
+ Returns:
+ list[Path]: A list of Path objects representing the other files in the directory.
+ """ # noqa: E501
+ if self.path.is_file():
+ return []
+ # Binary file extensions to exclude
+ binary_extensions = {
+ ".wav",
+ ".mp3",
+ ".mp4",
+ ".avi",
+ ".mov",
+ ".mkv", # Audio/Video
+ ".png",
+ ".jpg",
+ ".jpeg",
+ ".gif",
+ ".bmp",
+ ".ico",
+ ".svg", # Images
+ ".pdf",
+ ".zip",
+ ".tar",
+ ".gz",
+ ".bz2",
+ ".xz", # Archives/Documents
+ ".exe",
+ ".so",
+ ".dll",
+ ".dylib", # Binaries
+ ".bin",
+ ".dat",
+ ".db",
+ ".sqlite", # Data files
+ }
+
+ def is_other_file(file: Path) -> bool:
+ return file.is_file() and file != self.main_file and file.suffix.lower() not in binary_extensions
+
+ return [file for file in self.path.rglob("*") if is_other_file(file)]
+
+ def determine_title(self) -> str:
+ if not self.is_code:
+ # Specify encoding for building on Windows
+ with open(self.main_file, encoding="utf-8") as f:
+ first_line = f.readline().strip()
+ match = re.match(r"^#\s+(?P<title>.+)$", first_line)
+ if match:
+ return match.group("title")
+ return fix_case(self.path.stem.replace("_", " ").title())
+
+ def fix_relative_links(self, content: str) -> str:
+ """
+ Fix relative links in markdown content by converting them to gh-file
+ format.
+
+ Args:
+ content (str): The markdown content to process
+
+ Returns:
+ str: Content with relative links converted to gh-file format
+ """
+ # Regex to match markdown links [text](relative_path)
+ # This matches links that don't start with http, https, ftp, or #
+ link_pattern = r"\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)"
+
+ def replace_link(match):
+ link_text = match.group(1)
+ relative_path = match.group(2)
+
+ # Make relative to repo root
+ gh_file = (self.main_file.parent / relative_path).resolve()
+ gh_file = gh_file.relative_to(ROOT_DIR)
+
+ # Make GitHub URL
+ url = "https://github.com/vllm-project/vllm-omni/"
+ url += "tree/main" if self.path.is_dir() else "blob/main"
+ gh_url = f"{url}/{gh_file}"
+
+ return f"[{link_text}]({gh_url})"
+
+ return re.sub(link_pattern, replace_link, content)
+
+ def generate(self) -> str:
+ content = f"# {self.title}\n\n"
+ url = "https://github.com/vllm-project/vllm-omni/"
+ url += "tree/main" if self.path.is_dir() else "blob/main"
+ content += f"Source <{url}/{self.path.relative_to(ROOT_DIR)}>.\n\n"
+
+ # Use long code fence to avoid issues with
+ # included files containing code fences too
+ code_fence = "``````"
+
+ if self.is_code:
+ content += f'{code_fence}{self.main_file.suffix[1:]}\n--8<-- "{self.main_file.relative_to(ROOT_DIR)}"\n{code_fence}\n'
+ else:
+ with open(self.main_file) as f:
+ # Skip the title from md snippets as it's been included above
+ main_content = f.readlines()[1:]
+ content += self.fix_relative_links("".join(main_content))
+ content += "\n"
+
+ if not self.other_files:
+ return content
+
+ content += "## Example materials\n\n"
+ for file in sorted(self.other_files):
+ content += f'??? abstract "{file.relative_to(self.path)}"\n'
+ if file.suffix != ".md":
+ content += f" {code_fence}{file.suffix[1:]}\n"
+ content += f' --8<-- "{file.relative_to(ROOT_DIR)}"\n'
+ if file.suffix != ".md":
+ content += f" {code_fence}\n"
+
+ return content
+
+
+def update_nav_file(examples: list[Example]):
+ """
+ Update the .nav.yml file to include all generated examples.
+ This function completely regenerates the examples section based on the actual
+ folder structure, ensuring consistency between the examples folder and nav file.
+
+ Args:
+ examples: List of Example objects that have been generated
+ """
+ if not NAV_FILE.exists():
+ logger.warning("Navigation file not found: %s", NAV_FILE)
+ return
+
+ # Read the current nav file
+ with open(NAV_FILE, encoding="utf-8") as f:
+ nav_data = yaml.safe_load(f) or {}
+
+ nav_list = nav_data.get("nav", [])
+
+ # Find the "User Guide" section
+ user_guide_idx = None
+ examples_idx = None
+ for i, item in enumerate(nav_list):
+ if isinstance(item, dict) and "User Guide" in item:
+ user_guide_idx = i
+ user_guide_content = item["User Guide"]
+ # Find the "Examples" subsection
+ for j, subitem in enumerate(user_guide_content):
+ if isinstance(subitem, dict) and "Examples" in subitem:
+ examples_idx = j
+ break
+ break
+
+ if user_guide_idx is None or examples_idx is None:
+ logger.warning("Could not find 'User Guide' -> 'Examples' section in nav file")
+ return
+
+ # Get existing Examples section to preserve non-example items (like README.md)
+ existing_examples_content = nav_list[user_guide_idx]["User Guide"][examples_idx]["Examples"]
+
+ # Preserve string items (like "examples/README.md") that are not example categories
+ preserved_items = [
+ item
+ for item in existing_examples_content
+ if isinstance(item, str) and not item.startswith("user_guide/examples/")
+ ]
+
+ # Group examples by category
+ examples_by_category = {}
+ for example in examples:
+ category = example.category
+ if category not in examples_by_category:
+ examples_by_category[category] = []
+ examples_by_category[category].append(example)
+
+ # Build the new Examples section - start with preserved items
+ examples_section = preserved_items.copy()
+
+ # Add examples grouped by category, sorted by category name
+ for category in sorted(examples_by_category.keys()):
+ category_examples = sorted(examples_by_category[category], key=lambda e: e.path.stem)
+ category_items = []
+ for example in category_examples:
+ doc_path = EXAMPLE_DOC_DIR / example.category / f"{example.path.stem}.md"
+ rel_path = doc_path.relative_to(ROOT_DIR / "docs")
+ category_items.append({example.title: str(rel_path)})
+
+ if category_items:
+ # Format category name (e.g., "offline_inference" -> "Offline Inference")
+ category_title = fix_case(category.replace("_", " ").title())
+ examples_section.append({category_title: category_items})
+
+ # Update the nav structure
+ nav_list[user_guide_idx]["User Guide"][examples_idx]["Examples"] = examples_section
+
+ # Write back to file
+ nav_data["nav"] = nav_list
+ with open(NAV_FILE, "w", encoding="utf-8") as f:
+ yaml.dump(nav_data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
+ logger.info("Updated navigation file: %s", NAV_FILE.relative_to(ROOT_DIR))
+
+
+def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
+ logger.info("Generating example documentation")
+ logger.debug("Root directory: %s", ROOT_DIR.resolve())
+ logger.debug("Example directory: %s", EXAMPLE_DIR.resolve())
+ logger.debug("Example document directory: %s", EXAMPLE_DOC_DIR.resolve())
+
+ # Create the EXAMPLE_DOC_DIR if it doesn't exist
+ if not EXAMPLE_DOC_DIR.exists():
+ EXAMPLE_DOC_DIR.mkdir(parents=True)
+
+ categories = sorted(p for p in EXAMPLE_DIR.iterdir() if p.is_dir())
+
+ examples = []
+ glob_patterns = ["*.py", "*.md", "*.sh"]
+ # Find categorised examples
+ for category in categories:
+ globs = [category.glob(pattern) for pattern in glob_patterns]
+ for path in itertools.chain(*globs):
+ examples.append(Example(path, category.stem))
+ # Find examples in subdirectories
+ for path in category.glob("*/*.md"):
+ examples.append(Example(path.parent, category.stem))
+
+ # Generate the example documentation
+ for example in sorted(examples, key=lambda e: e.path.stem):
+ example_name = f"{example.path.stem}.md"
+ doc_path = EXAMPLE_DOC_DIR / example.category / example_name
+ if not doc_path.parent.exists():
+ doc_path.parent.mkdir(parents=True)
+ # Specify encoding for building on Windows
+ with open(doc_path, "w+", encoding="utf-8") as f:
+ f.write(example.generate())
+ logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
+
+ # Update the navigation file
+ update_nav_file(examples)
diff --git a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md
index c11360bcfe..7ec3ed3afe 100644
--- a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md
+++ b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md
@@ -1,55 +1,40 @@
-# Offline Inference Example of vLLM-Omni for Qwen2.5-Omni
+# Offline Example of vLLM-Omni for Qwen2.5-omni
-Source .
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen2_5_omni>.
## đ ī¸ Installation
-Please refer to [installation](../../../getting_started/installation/README.md).
-
-## Run Offline inference with Qwen2.5-Omni.
-First, navigate to the example folder
-```bash
-cd examples/offline_inference/qwen2_5_omni
-```
-Inside the directory, `end2end.py` is a comprehensive demo suite for initializing a model instance of `Qwen/Qwen2.5-Omni-7B` and use it for various offline inference tasks.
-??? abstract "end2end.py"
- ``````py
- --8<-- "examples/offline_inference/qwen2_5_omni/end2end.py"
- ``````
-
-Below we also provide simple bash scripts to execute this file.
-### Single Prompt
-```bash
-bash run_single_prompt.sh
-```
-??? abstract "run_single_prompt.sh"
- ``````sh
- --8<-- "examples/offline_inference/qwen2_5_omni/run_single_prompt.sh"
- ``````
+Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md)
+## Run examples (Qwen2.5-omni)
### Multiple Prompts
-Download dataset from [seed_tts](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit) and extract the prompts with `extract_prompts.py`
-??? abstract "extract_prompts.py"
- ``````py
- --8<-- "examples/offline_inference/qwen2_5_omni/extract_prompts.py"
- ``````
+Download dataset from [seed_tts](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit). To get the prompt, you can:
```bash
tar -xf /seedtts_testset.tar
cp seedtts_testset/en/meta.lst examples/offline_inference/qwen2_5_omni/meta.lst
python3 examples/offline_inference/qwen2_5_omni/extract_prompts.py \
--input examples/offline_inference/qwen2_5_omni/meta.lst \
--output examples/offline_inference/qwen2_5_omni/top100.txt \
- --topk 10
+ --topk 100
+```
+Get into the example folder
+```bash
+cd examples/offline_inference/qwen2_5_omni
```
Then run the command below.
```bash
bash run_multiple_prompts.sh
```
-??? abstract "run_multiple_prompts.sh"
- ``````sh
- --8<-- "examples/offline_inference/qwen2_5_omni/run_multiple_prompts.sh"
- ``````
+### Single Prompt
+Get into the example folder
+```bash
+cd examples/offline_inference/qwen2_5_omni
+```
+Then run the command below.
+```bash
+bash run_single_prompt.sh
+```
### FAQ
@@ -58,3 +43,22 @@ If you encounter error about backend of librosa, try to install ffmpeg with comm
sudo apt update
sudo apt install ffmpeg
```
+
+## Example materials
+
+??? abstract "end2end.py"
+ ``````py
+ --8<-- "examples/offline_inference/qwen2_5_omni/end2end.py"
+ ``````
+??? abstract "extract_prompts.py"
+ ``````py
+ --8<-- "examples/offline_inference/qwen2_5_omni/extract_prompts.py"
+ ``````
+??? abstract "run_multiple_prompts.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen2_5_omni/run_multiple_prompts.sh"
+ ``````
+??? abstract "run_single_prompt.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen2_5_omni/run_single_prompt.sh"
+ ``````
diff --git a/docs/user_guide/examples/offline_inference/qwen3_omni.md b/docs/user_guide/examples/offline_inference/qwen3_omni.md
new file mode 100644
index 0000000000..58ea77b2b3
--- /dev/null
+++ b/docs/user_guide/examples/offline_inference/qwen3_omni.md
@@ -0,0 +1,64 @@
+# Offline Example of vLLM-Omni for Qwen3-omni
+
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen3_omni>.
+
+
+## đ ī¸ Installation
+
+Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md)
+
+## Run examples (Qwen3-omni)
+### Multiple Prompts
+Download the dataset from [seed_tts](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit). For dataset processing, please refer to the [Qwen2.5-omni README.md](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen2_5_omni/README.md)
+Get into the example folder
+```bash
+cd examples/offline_inference/qwen3_omni
+```
+Then run the command below.
+```bash
+bash run_multiple_prompts.sh
+```
+### Single Prompt
+Get into the example folder
+```bash
+cd examples/offline_inference/qwen3_omni
+```
+Then run the command below.
+```bash
+bash run_single_prompt.sh
+```
+If you do not have enough memory, you can run the thinker with tensor parallelism. Just run the command below.
+```bash
+bash run_single_prompt_tp.sh
+```
+
+### FAQ
+
+If you encounter error about backend of librosa, try to install ffmpeg with command below.
+```
+sudo apt update
+sudo apt install ffmpeg
+```
+
+## Example materials
+
+??? abstract "end2end.py"
+ ``````py
+ --8<-- "examples/offline_inference/qwen3_omni/end2end.py"
+ ``````
+??? abstract "qwen3_omni_moe_tp.yaml"
+ ``````yaml
+ --8<-- "examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml"
+ ``````
+??? abstract "run_multiple_prompts.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen3_omni/run_multiple_prompts.sh"
+ ``````
+??? abstract "run_single_prompt.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen3_omni/run_single_prompt.sh"
+ ``````
+??? abstract "run_single_prompt_tp.sh"
+ ``````sh
+ --8<-- "examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh"
+ ``````
diff --git a/docs/user_guide/examples/offline_inference/qwen_image.md b/docs/user_guide/examples/offline_inference/qwen_image.md
index 904a2c0910..ce2c29b2a1 100644
--- a/docs/user_guide/examples/offline_inference/qwen_image.md
+++ b/docs/user_guide/examples/offline_inference/qwen_image.md
@@ -1,25 +1,15 @@
-# Offline Inference Example of vLLM-Omni for Qwen-Image
+# Qwen-Image Offline Inference
-Source .
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen_image>.
-## đ ī¸ Installation
-
-Please refer to [installation](../../../getting_started/installation/README.md).
-
-## Run Offline inference with Qwen-Image
-
-First, navigate to the example folder
-```bash
-cd examples/offline_inference/qwen_image
-```
-
This folder provides two simple entrypoints for experimenting with `Qwen/Qwen-Image` using vLLM-Omni:
- `text_to_image.py`: command-line script for single image generation.
-- `gradio_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration.
+- `web_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration.
-### Command-line Usage
+
+## Local CLI Usage
```bash
python text_to_image.py \
@@ -43,14 +33,9 @@ Key arguments:
- `--height/--width`: output resolution (defaults 1024x1024).
- `--output`: path to save the generated PNG.
-??? abstract "text_to_image.py"
- ``````py
- --8<-- "examples/offline_inference/qwen_image/text_to_image.py"
- ``````
-
> âšī¸ Qwen-Image currently publishes best-effort presets at `1328x1328`, `1664x928`, `928x1664`, `1472x1140`, `1140x1472`, `1584x1056`, and `1056x1584`. Adjust `--height/--width` accordingly for the most reliable outcomes.
-### Web UI Demo
+## Web UI Demo
Launch the gradio demo:
@@ -60,7 +45,13 @@ python gradio_demo.py --port 7862
Then open `http://localhost:7862/` on your local browser to interact with the web UI.
+## Example materials
+
??? abstract "gradio_demo.py"
``````py
- --8<-- "examples/offline_inference/qwen_image/gradio_demo.py"
+ --8<-- "examples/offline_inference/qwen_image/gradio_demo.py"
+ ``````
+??? abstract "text_to_image.py"
+ ``````py
+ --8<-- "examples/offline_inference/qwen_image/text_to_image.py"
``````
diff --git a/docs/user_guide/examples/online_serving/qwen2_5_omni.md b/docs/user_guide/examples/online_serving/qwen2_5_omni.md
index 96c283d168..7918c5f841 100644
--- a/docs/user_guide/examples/online_serving/qwen2_5_omni.md
+++ b/docs/user_guide/examples/online_serving/qwen2_5_omni.md
@@ -1,43 +1,38 @@
-# Online Serving Example of vLLM-Omni for Qwen2.5-Omni
+# Online serving Example of vLLM-Omni for Qwen2.5-omni
-Source .
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/qwen2_5_omni>.
## đ ī¸ Installation
-Please refer to [installation](../../../getting_started/installation/README.md).
+Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/examples/README.md)
-## Deploy Qwen/Qwen2.5-Omni-7B
+## Run examples (Qwen2.5-omni)
-First launch the OpenAI-compatible inference server
+Launch the server
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091
```
+
If you have custom stage configs file, launch the server with command below
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
```
-## Query the model
-Navigate to the example folder
+
+Get into the example folder
```bash
cd examples/online_serving
```
-Query the model server with OpenAI Python API client:
+
+Send request via python
```bash
python openai_chat_completion_client_for_multimodal_generation.py --query-type mixed_modalities
```
-??? abstract "openai_chat_completion_client_for_multimodal_generation.py"
- ``````py
- --8<-- "examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py"
- ``````
-You can also query the model with `curl` command:
+
+Send request via curl
```bash
bash run_curl_multimodal_generation.sh mixed_modalities
```
-??? abstract "run_curl_multimodal_generation.sh"
- ``````py
- --8<-- "examples/online_serving/run_curl_multimodal_generation.sh"
- ``````
### FAQ
@@ -49,16 +44,17 @@ sudo apt install ffmpeg
## Run Local Web UI Demo
-You can also deploy a Gradio Web UI that allows users to interact with the model through a web browser. Below is an example on how to do so with `Qwen/Qwen2.5-Omni-7B`.
+This Web UI demo allows users to interact with the model through a web browser.
### Running Gradio Demo
-Install gradio with `uv pip install "gradio>=5.49.1,<6.0.0"`, then you can launch the web service built on AsyncOmni by
+Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by
```bash
python gradio_demo.py --model Qwen/Qwen2.5-Omni-7B --port 7861
```
-Now you can interact with model via the web UI at `http://localhost:7861/` on your local browser.
+
+Then open `http://localhost:7861/` on your local browser to interact with the web UI.
### Options
@@ -78,3 +74,18 @@ python gradio_demo.py \
- `--port`: Port for the Gradio server (default `7861`).
- `--stage-configs-path`: Optional path to custom stage configs YAML.
- `--share`: Set to expose a temporary public link via Gradio.
+
+## Example materials
+
+??? abstract "gradio_demo.py"
+ ``````py
+ --8<-- "examples/online_serving/qwen2_5_omni/gradio_demo.py"
+ ``````
+??? abstract "openai_chat_completion_client_for_multimodal_generation.py"
+ ``````py
+ --8<-- "examples/online_serving/qwen2_5_omni/openai_chat_completion_client_for_multimodal_generation.py"
+ ``````
+??? abstract "run_curl_multimodal_generation.sh"
+ ``````sh
+ --8<-- "examples/online_serving/qwen2_5_omni/run_curl_multimodal_generation.sh"
+ ``````
diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md
new file mode 100644
index 0000000000..3d1ba770bc
--- /dev/null
+++ b/docs/user_guide/examples/online_serving/qwen3_omni.md
@@ -0,0 +1,95 @@
+# Online serving Example of vLLM-Omni for Qwen3-omni
+
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/qwen3_omni>.
+
+
+## đ ī¸ Installation
+
+Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/examples/README.md)
+
+## Run examples (Qwen3-Omni)
+
+Launch the server
+```bash
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
+```
+
+If you have custom stage configs file, launch the server with command below
+```bash
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
+```
+
+Get into the example folder
+```bash
+cd examples/online_serving
+```
+
+Send request via python
+```bash
+python openai_chat_completion_client_for_multimodal_generation.py --query-type mixed_modalities
+```
+
+Send request via curl
+```bash
+bash run_curl_multimodal_generation.sh mixed_modalities
+```
+
+### FAQ
+
+If you encounter error about backend of librosa, try to install ffmpeg with command below.
+```
+sudo apt update
+sudo apt install ffmpeg
+```
+
+## Run Local Web UI Demo
+
+This Web UI demo allows users to interact with the model through a web browser.
+
+### Running Gradio Demo
+
+Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by
+
+```bash
+python gradio_demo.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 7861
+```
+
+Then open `http://localhost:7861/` on your local browser to interact with the web UI.
+
+
+### Options
+
+The gradio demo also supports running with an existing API server and can be customized with the following arguments.
+
+
+```bash
+python gradio_demo.py \
+ --model Qwen/Qwen3-Omni-30B-A3B-Instruct \
+ --use-api-server \
+ --api-base http://localhost:8091/v1 \
+ --ip 127.0.0.1 \
+ --port 7861
+```
+
+- `--model`: Model name
+- `--use-api-server`: If set, connect to an existing vLLM HTTP API server instead of running AsyncOmniLLM locally.
+- `--api-base`: Base URL for vllm serve (only used when `use-api-server` is set, default: http://localhost:8091/v1)
+- `--ip`: Host/IP for Gradio server (default: 127.0.0.1)
+- `--port`: Port for Gradio server (default: 7861)
+- `--stage-configs-path`: Path to custom stage configs YAML file (optional)
+- `--share`: Share the Gradio demo publicly (creates a public link)
+
+## Example materials
+
+??? abstract "gradio_demo.py"
+ ``````py
+ --8<-- "examples/online_serving/qwen3_omni/gradio_demo.py"
+ ``````
+??? abstract "openai_chat_completion_client_for_multimodal_generation.py"
+ ``````py
+ --8<-- "examples/online_serving/qwen3_omni/openai_chat_completion_client_for_multimodal_generation.py"
+ ``````
+??? abstract "run_curl_multimodal_generation.sh"
+ ``````sh
+ --8<-- "examples/online_serving/qwen3_omni/run_curl_multimodal_generation.sh"
+ ``````
diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md
index afdaee13ea..4c52925991 100644
--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
@@ -23,7 +23,7 @@ Then run the command below.
```bash
bash run_multiple_prompts.sh
```
-### Single Prompts
+### Single Prompt
Get into the example folder
```bash
cd examples/offline_inference/qwen2_5_omni
diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py
index c5be18a94a..463db0aa89 100644
--- a/examples/offline_inference/qwen2_5_omni/end2end.py
+++ b/examples/offline_inference/qwen2_5_omni/end2end.py
@@ -16,7 +16,7 @@
from vllm.sampling_params import SamplingParams
from vllm.utils import FlexibleArgumentParser
-from vllm_omni.entrypoints.omni_llm import OmniLLM
+from vllm_omni.entrypoints.omni import Omni
SEED = 42
@@ -140,7 +140,7 @@ def main(args):
model_name = "Qwen/Qwen2.5-Omni-7B"
query_result = query_map[args.query_type]()
- omni_llm = OmniLLM(
+ omni_llm = Omni(
model=model_name,
log_stats=args.enable_stats,
log_file=("omni_llm_pipeline.log" if args.enable_stats else None),
diff --git a/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh b/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh
index 5b3c19cdc2..2f2b3ae756 100644
--- a/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh
+++ b/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh
@@ -1,2 +1,2 @@
python end2end.py --output-wav output_audio \
- --query-type use_audio_in_video
+ --query-type text
diff --git a/examples/offline_inference/qwen3_omni/README.md b/examples/offline_inference/qwen3_omni/README.md
index 766453667f..22c7651632 100644
--- a/examples/offline_inference/qwen3_omni/README.md
+++ b/examples/offline_inference/qwen3_omni/README.md
@@ -1,4 +1,4 @@
-# Offline Example of vLLM-Omni for Qwen2.5-omni
+# Offline Example of vLLM-Omni for Qwen3-omni
## đ ī¸ Installation
@@ -15,7 +15,7 @@ Then run the command below.
```bash
bash run_multiple_prompts.sh
```
-### Single Prompts
+### Single Prompt
Get into the example folder
```bash
cd examples/offline_inference/qwen3_omni
@@ -24,6 +24,10 @@ Then run the command below.
```bash
bash run_single_prompt.sh
```
+If you do not have enough memory, you can run the thinker stage with tensor parallelism. Just run the command below.
+```bash
+bash run_single_prompt_tp.sh
+```
### FAQ
diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py
index c1fa21a621..54f74c6097 100644
--- a/examples/offline_inference/qwen3_omni/end2end.py
+++ b/examples/offline_inference/qwen3_omni/end2end.py
@@ -16,7 +16,7 @@
from vllm.multimodal.image import convert_image_mode
from vllm.utils import FlexibleArgumentParser
-from vllm_omni.entrypoints.omni_llm import OmniLLM
+from vllm_omni.entrypoints.omni import Omni
SEED = 42
@@ -127,8 +127,9 @@ def main(args):
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
query_result = query_map[args.query_type]()
- omni_llm = OmniLLM(
+ omni_llm = Omni(
model=model_name,
+ stage_configs_path=args.stage_configs_path,
)
thinker_sampling_params = SamplingParams(
@@ -276,6 +277,12 @@ def parse_args():
default=None,
help="Path to a .txt file with one prompt per line (preferred).",
)
+ parser.add_argument(
+ "--stage-configs-path",
+ type=str,
+ default=None,
+ help="Path to a stage configs file.",
+ )
return parser.parse_args()
@@ -283,7 +290,3 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
main(args)
-
- # use examples:
- # python end2end.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --query-type text
- # python end2end.py --model /custom_path/Qwen3-Omni-30B-A3B-Instruct --query-type use_video_only
diff --git a/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml b/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml
new file mode 100644
index 0000000000..8d9c2900e6
--- /dev/null
+++ b/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml
@@ -0,0 +1,94 @@
+# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
+# Stage 0: Thinker (multimodal understanding + text generation)
+# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes)
+# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
+
+stage_args:
+ - stage_id: 0
+ runtime:
+ devices: "0,1"
+ max_batch_size: 1
+ engine_args:
+ model_stage: thinker
+ model_arch: Qwen3OmniMoeForConditionalGeneration
+ worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+ scheduler_cls: vllm_omni.core.sched.scheduler.OmniScheduler
+ gpu_memory_utilization: 0.8
+ enforce_eager: true
+ trust_remote_code: true
+ engine_output_type: latent # Output hidden states for talker
+ distributed_executor_backend: "mp"
+ enable_prefix_caching: false
+ hf_config_name: thinker_config
+ tensor_parallel_size: 2
+ final_output: true
+ final_output_type: text
+ is_comprehension: true
+ default_sampling_params:
+ temperature: 0.4
+ top_p: 0.9
+ top_k: 1
+ max_tokens: 2048
+ seed: 42
+ detokenize: True
+ repetition_penalty: 1.05
+
+ - stage_id: 1
+ runtime:
+ devices: "1"
+ max_batch_size: 1
+ engine_args:
+ model_stage: talker
+ model_arch: Qwen3OmniMoeForConditionalGeneration
+ worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+ scheduler_cls: vllm_omni.core.sched.scheduler.OmniScheduler
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ trust_remote_code: true
+ engine_output_type: latent # Output codec codes for code2wav
+ # tensor_parallel_size: 2
+ enable_prefix_caching: false
+ distributed_executor_backend: "mp"
+ hf_config_name: talker_config
+ engine_input_source: [0]
+ custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
+ # final_output: true
+ # final_output_type: text
+ default_sampling_params:
+ temperature: 0.9
+ top_k: 50
+ max_tokens: 4096
+ seed: 42
+ detokenize: False
+ repetition_penalty: 1.05
+ stop_token_ids: [2150]
+
+ - stage_id: 2
+ runtime:
+ devices: "1"
+ max_batch_size: 1
+ engine_args:
+ model_stage: code2wav
+ model_arch: Qwen3OmniMoeForConditionalGeneration
+ worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+ scheduler_cls: vllm_omni.core.sched.generation_scheduler.GenerationScheduler
+ enforce_eager: true
+ trust_remote_code: true
+ enable_prefix_caching: false
+ engine_output_type: audio # Final output: audio waveform
+ gpu_memory_utilization: 0.1
+ distributed_executor_backend: "mp"
+ max_num_batched_tokens: 1000000
+ hf_config_name: thinker_config
+ engine_input_source: [1]
+ custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
+ final_output: true
+ final_output_type: audio
+ default_sampling_params:
+ temperature: 0.0
+ top_p: 1.0
+ top_k: -1
+ max_tokens: 65536
+ seed: 42
+ detokenize: True
+ repetition_penalty: 1.1
diff --git a/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh b/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh
new file mode 100644
index 0000000000..eb25b6bc05
--- /dev/null
+++ b/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh
@@ -0,0 +1,6 @@
+python end2end.py --output-wav output_audio \
+ --query-type use_audio \
+ --init-sleep-seconds 90 \
+ --stage-configs-path qwen3_omni_moe_tp.yaml
+
+# --init-sleep-seconds staggers startup so that two vLLM stages sharing a card are not initialized at the same time.
diff --git a/examples/online_serving/qwen3_omni/README.md b/examples/online_serving/qwen3_omni/README.md
index 01a0fdd286..6e4b27ea38 100644
--- a/examples/online_serving/qwen3_omni/README.md
+++ b/examples/online_serving/qwen3_omni/README.md
@@ -1,4 +1,4 @@
-# Online serving Example of vLLM-omni for Qwen2.5-omni
+# Online serving Example of vLLM-Omni for Qwen3-omni
## đ ī¸ Installation
diff --git a/mkdocs.yml b/mkdocs.yml
index dfa29f59bd..31ca350fa4 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -58,6 +58,7 @@ theme:
hooks:
- docs/mkdocs/hooks/generate_api_readme.py
- docs/mkdocs/hooks/url_schemes.py
+ - docs/mkdocs/hooks/generate_examples.py
# Plugins
plugins:
diff --git a/vllm_omni/__init__.py b/vllm_omni/__init__.py
index 89c7cbfc6c..fd7f9174fb 100644
--- a/vllm_omni/__init__.py
+++ b/vllm_omni/__init__.py
@@ -24,7 +24,7 @@
from .entrypoints.async_omni_llm import AsyncOmniLLM
# Main entry points
-# from .entrypoints.omni import Omni
+from .entrypoints.omni import Omni
from .version import __version__, __version_tuple__ # isort:skip
@@ -33,7 +33,7 @@
"__version__",
"__version_tuple__",
# Main components
- # "Omni",
+ "Omni",
"AsyncOmniLLM",
# Configuration
"OmniModelConfig",
diff --git a/vllm_omni/diffusion/models/__init__.py b/vllm_omni/diffusion/models/__init__.py
new file mode 100644
index 0000000000..9e7471a2da
--- /dev/null
+++ b/vllm_omni/diffusion/models/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Diffusion model implementations."""
diff --git a/vllm_omni/diffusion/models/qwen_image/__init__.py b/vllm_omni/diffusion/models/qwen_image/__init__.py
index 69d6821f4f..84fa2259d4 100644
--- a/vllm_omni/diffusion/models/qwen_image/__init__.py
+++ b/vllm_omni/diffusion/models/qwen_image/__init__.py
@@ -1,3 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Qwen Image model components."""
+"""Qwen Image diffusion model components."""
+
+from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image import (
+ QwenImagePipeline,
+ get_qwen_image_post_process_func,
+)
+from vllm_omni.diffusion.models.qwen_image.qwen_image_transformer import (
+ QwenImageTransformer2DModel,
+)
+
+__all__ = [
+ "QwenImagePipeline",
+ "QwenImageTransformer2DModel",
+ "get_qwen_image_post_process_func",
+]
diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py
index cbedc5c317..69917361a1 100644
--- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py
+++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py
@@ -392,8 +392,6 @@ def encode_prompt(
Args:
prompt (`str` or `list[str]`, *optional*):
prompt to be encoded
- device: (`torch.device`):
- torch device
num_images_per_prompt (`int`):
number of images that should be generated per prompt
prompt_embeds (`torch.Tensor`, *optional*):
diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
index c0ef5fd123..b2f4e35bdb 100644
--- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
+++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
@@ -113,10 +113,12 @@ def __init__(self, theta: int, axes_dim: list[int], scale_rope=False):
# DO NOT USING REGISTER BUFFER HERE, IT WILL CAUSE COMPLEX NUMBERS LOSE ITS IMAGINARY PART
self.scale_rope = scale_rope
- def rope_params(self, index, dim, theta=10000):
+ def rope_params(self, index: torch.Tensor, dim: int, theta: int = 10000):
"""
Args:
- index: [0, 1, 2, 3] 1D Tensor representing the position index of the token
+ index (`torch.Tensor`): [0, 1, 2, 3] 1D Tensor representing the position index of the token
+ dim (`int`): Dimension for the rope parameters
+ theta (`int`): Theta parameter for rope
"""
assert dim % 2 == 0
freqs = torch.outer(
diff --git a/vllm_omni/diffusion/worker/__init__.py b/vllm_omni/diffusion/worker/__init__.py
index 068a25f8f9..dc3306dae3 100644
--- a/vllm_omni/diffusion/worker/__init__.py
+++ b/vllm_omni/diffusion/worker/__init__.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Diffusion worker components."""
+"""Worker classes for diffusion models."""
from vllm_omni.diffusion.worker.gpu_worker import GPUWorker, WorkerProc
diff --git a/vllm_omni/entrypoints/async_omni_llm.py b/vllm_omni/entrypoints/async_omni_llm.py
index 5046196d05..c45acdc44a 100644
--- a/vllm_omni/entrypoints/async_omni_llm.py
+++ b/vllm_omni/entrypoints/async_omni_llm.py
@@ -94,7 +94,7 @@ def __init__(
shm_threshold_bytes: int = 65536,
batch_timeout: int = 10,
init_timeout: int = 60000,
- **kwargs,
+ **kwargs: Any,
):
self.batch_timeout = batch_timeout
self._enable_stats: bool = bool(log_stats)
diff --git a/vllm_omni/entrypoints/omni_llm.py b/vllm_omni/entrypoints/omni_llm.py
index 4b9b1b4f64..94d93ef767 100644
--- a/vllm_omni/entrypoints/omni_llm.py
+++ b/vllm_omni/entrypoints/omni_llm.py
@@ -534,8 +534,9 @@ def __init__(
self,
model: str,
compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None,
+ hf_overrides: Optional[dict[str, Any]] = None,
structured_outputs_config: Optional[Union[dict[str, Any], StructuredOutputsConfig]] = None,
- **kwargs,
+ **kwargs: Any,
):
"""LLM constructor."""
if "disable_log_stats" not in kwargs:
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py
index 49215be36d..5c5c5b15d1 100644
--- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py
+++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py
@@ -6,6 +6,7 @@
autoregressively, predicting layers 1 to N based on layer-0 codes from the talker.
"""
+from collections import namedtuple
from typing import Any, Optional
import torch
@@ -164,21 +165,48 @@ def forward(
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
- attention_interface = ALL_ATTENTION_FUNCTIONS["flash_attention_2"]
- attn_output, _ = attention_interface(
- self,
- q_heads,
- k_heads,
- v_heads,
- None,
- dropout=0.0 if not self.training else self.attention_dropout,
- scaling=self.head_dim**-0.5,
- sliding_window=None,
- use_cache=use_cache,
- position_ids=position_ids,
- output_hidden_states=True,
- output_attentions=False,
- )
+ # Try attention backends in order of preference, with runtime error handling
+ # This handles cases where the backend is registered but not actually available
+ attention_backends = ["flash_attention_2", "xformers", "eager", "sdpa"]
+ attn_output = None
+ last_error = None
+
+ for backend_name in attention_backends:
+ if backend_name not in ALL_ATTENTION_FUNCTIONS:
+ continue
+
+ try:
+ attention_interface = ALL_ATTENTION_FUNCTIONS[backend_name]
+ attn_output, _ = attention_interface(
+ self,
+ q_heads,
+ k_heads,
+ v_heads,
+ None,
+ dropout=0.0 if not self.training else getattr(self, "attention_dropout", 0.0),
+ scaling=self.head_dim**-0.5,
+ sliding_window=None,
+ use_cache=use_cache,
+ position_ids=position_ids,
+ output_hidden_states=True,
+ output_attentions=False,
+ )
+ # Success - log fallback if not using flash_attention_2
+ if backend_name != "flash_attention_2":
+ logger.warning_once(
+ f"Using {backend_name} attention backend (flash_attention_2 not available or failed)"
+ )
+ break
+ except (ValueError, ImportError, RuntimeError, AttributeError) as e:
+ # Store error and try next backend
+ last_error = e
+ continue
+
+ if attn_output is None:
+ raise RuntimeError(
+ f"All attention backends failed. Last error: {last_error}. "
+ "Please install flash-attn, or ensure PyTorch's scaled_dot_product_attention is available."
+ )
attn_output = attn_output.reshape(*(hidden_states.shape[:-1]), -1).contiguous()
attn_output = self.o_proj(attn_output)
@@ -373,16 +401,22 @@ def forward(
past_key_values: Optional[Any] = None,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
- **kwargs,
- ):
+ **kwargs: Any,
+ ) -> Any:
"""
Forward pass matching HF structure.
Args:
inputs_embeds: [batch, seq_len, hidden_size]
+ attention_mask: Optional attention mask tensor
+ position_ids: Optional position IDs tensor
+ past_key_values: Optional cached key-value pairs
+ use_cache: Whether to use cache
+ cache_position: Optional cache position tensor
+ **kwargs: Additional keyword arguments
Returns:
- Object with .last_hidden_state attribute
+ Named tuple with .last_hidden_state and .past_key_values attributes
"""
batch_size, seq_len, _ = inputs_embeds.shape
@@ -421,8 +455,6 @@ def forward(
hidden_states = self.norm(hidden_states)
# Return in HF-compatible format
- from collections import namedtuple
-
Output = namedtuple("Output", ["last_hidden_state", "past_key_values"])
return Output(last_hidden_state=hidden_states, past_key_values=None) # [batch, num_code_groups-1, hidden_size]
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py
index 34e05925f9..91231d0fbc 100644
--- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py
+++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py
@@ -1,4 +1,5 @@
from collections.abc import Iterable
+from typing import Any
import torch
import torch.nn as nn
@@ -284,7 +285,7 @@ def code_predictor_forward(
return result_codes, summed_embeddings
- def init_multi_modal(self, thinker_config):
+ def init_multi_modal(self, thinker_config: Any) -> None:
"""
Initialize multimodal components from the thinker.
diff --git a/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py b/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py
index e2d07aa1bb..b3b5500ef1 100644
--- a/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py
+++ b/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py
@@ -3,7 +3,7 @@
# Copyright 2025 The Qwen team.
"""Stage input processor for Qwen3 Omni MoE: Thinker â Talker transition."""
-from typing import Union
+from typing import Any, Union
import torch
from vllm.inputs import TextPrompt
@@ -47,11 +47,11 @@ def _compute_talker_prompt_ids_length(info):
def thinker2talker(
- stage_list,
- engine_input_source,
- prompt: Union[OmniTokensPrompt, TextPrompt] = None,
+ stage_list: list[Any],
+ engine_input_source: list[int],
+ prompt: Union[OmniTokensPrompt, TextPrompt, None] = None,
requires_multimodal_data: bool = False,
-):
+) -> list[OmniTokensPrompt]:
"""
Process thinker outputs to create talker inputs.
@@ -64,6 +64,7 @@ def thinker2talker(
stage_list: List of stage objects
engine_input_source: Source stage IDs (typically [0] for thinker)
prompt: Original prompt data
+ requires_multimodal_data: Whether multimodal data is required
Returns:
List of OmniTokensPrompt for talker stage
@@ -111,11 +112,11 @@ def thinker2talker(
def talker2code2wav(
- stage_list,
- engine_input_source,
- prompt: Union[OmniTokensPrompt, TextPrompt] = None,
+ stage_list: list[Any],
+ engine_input_source: list[int],
+ prompt: Union[OmniTokensPrompt, TextPrompt, None] = None,
requires_multimodal_data: bool = False,
-):
+) -> list[OmniTokensPrompt]:
"""
Process talker outputs to create code2wav inputs.
@@ -128,6 +129,7 @@ def talker2code2wav(
stage_list: List of stage objects
engine_input_source: Source stage IDs (typically [1] for talker)
prompt: Original prompt data
+ requires_multimodal_data: Whether multimodal data is required
Returns:
List of OmniTokensPrompt for code2wav stage