diff --git a/.gitignore b/.gitignore index e38e2dc747..1c15c43954 100644 --- a/.gitignore +++ b/.gitignore @@ -162,9 +162,6 @@ docker/ # scripts /scripts/ -# tests -tests/ - # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be added to the global gitignore or merged into this project gitignore. For a PyCharm diff --git a/docs/.nav.yml b/docs/.nav.yml index fb9c7fb443..7bada971c6 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -1,44 +1,46 @@ nav: - - Home: README.md - - User Guide: - - Getting Started: - - getting_started/quickstart.md - - getting_started/installation - - Examples: - - examples/README.md - - Offline Inference: - - Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md - - Qwen2.5-Image: user_guide/examples/offline_inference/qwen_image.md - - Online Serving: - - Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md - - General: - - usage/* - - Configuration: - - configuration/README.md - - configuration/* - - Models: - - models/supported_models.md - - Developer Guide: - - General: - - contributing/README.md - - glob: contributing/* - flatten_single_child_sections: true - - Model Implementation: - - contributing/model/README.md - - CI: contributing/ci - - Design Documents: - - design/index.md - - design/architecture_overview.md - - design/vllm_omni_design.md - - design/mrs_design.md - - design/api_design_doc.md - - Docs Guide: contributing/DOCS_GUIDE.md - - API Reference: - - api/README.md - - api/vllm_omni - - CLI Reference: cli - - Community: - - community/* - - Slack: https://slack.vllm.ai - - Blog: https://blog.vllm.ai - - Forum: https://discuss.vllm.ai +- Home: README.md +- User Guide: + - Getting Started: + - getting_started/quickstart.md + - getting_started/installation + - Examples: + - examples/README.md + - Offline Inference: + - Offline Example of vLLM-Omni for Qwen2.5-omni: user_guide/examples/offline_inference/qwen2_5_omni.md + - Offline Example of vLLM-Omni for 
Qwen3-omni: user_guide/examples/offline_inference/qwen3_omni.md + - Qwen-Image Offline Inference: user_guide/examples/offline_inference/qwen_image.md + - Online Serving: + - Online serving Example of vLLM-Omni for Qwen2.5-omni: user_guide/examples/online_serving/qwen2_5_omni.md + - Online serving Example of vLLM-Omni for Qwen3-omni: user_guide/examples/online_serving/qwen3_omni.md + - General: + - usage/* + - Configuration: + - configuration/README.md + - configuration/* + - Models: + - models/supported_models.md +- Developer Guide: + - General: + - contributing/README.md + - glob: contributing/* + flatten_single_child_sections: true + - Model Implementation: + - contributing/model/README.md + - CI: contributing/ci + - Design Documents: + - design/index.md + - design/architecture_overview.md + - design/vllm_omni_design.md + - design/mrs_design.md + - design/api_design_doc.md + - Docs Guide: contributing/DOCS_GUIDE.md +- API Reference: + - api/README.md + - api/vllm_omni +- CLI Reference: cli +- Community: + - community/* + - Slack: https://slack.vllm.ai + - Blog: https://blog.vllm.ai + - Forum: https://discuss.vllm.ai diff --git a/docs/api/README.md b/docs/api/README.md index 100e17ad44..bb4e9d8777 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -34,6 +34,8 @@ Engine classes for offline and online inference. - [vllm_omni.diffusion.diffusion_engine.DiffusionEngine][] - [vllm_omni.engine.AdditionalInformationEntry][] - [vllm_omni.engine.AdditionalInformationPayload][] +- [vllm_omni.engine.OmniEngineCoreOutput][] +- [vllm_omni.engine.OmniEngineCoreOutputs][] - [vllm_omni.engine.OmniEngineCoreRequest][] - [vllm_omni.engine.PromptEmbedsPayload][] - [vllm_omni.engine.arg_utils.AsyncOmniEngineArgs][] @@ -48,6 +50,7 @@ Core scheduling and caching components. 
- [vllm_omni.core.dit_cache_manager.DiTCacheManager][] - [vllm_omni.core.sched.diffusion_scheduler.DiffusionScheduler][] +- [vllm_omni.core.sched.generation_scheduler.GenerationScheduler][] - [vllm_omni.core.sched.output.OmniNewRequestData][] - [vllm_omni.core.sched.scheduler.OmniScheduler][] @@ -55,7 +58,7 @@ Core scheduling and caching components. Model execution components. -- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni.OmniOutput][] +- [vllm_omni.model_executor.models.output_templates.OmniOutput][] - [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni.Qwen2_5OmniForConditionalGeneration][] - [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_talker.Qwen2_5OmniTalkerForConditionalGeneration][] - [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker.Qwen2_5OmniAudioFeatureInputs][] @@ -72,6 +75,22 @@ Model execution components. - [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2EmbeddingModel][] - [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2ForCausalLM][] - [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2Model][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_moe.Qwen3MoeForCausalLM][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni.Qwen3OmniMoeForConditionalGeneration][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_code2wav.Qwen3OmniMoeCode2Wav][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniCodePredictorBaseModel][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniMoeTalkerCodePredictor][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeModel][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeTalkerForConditionalGeneration][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMForCausalLM][] +- 
[vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMModel][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeConditionalGenerationMixin][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerForConditionalGeneration][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerMultiModalProcessor][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerProcessingInfo][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3Omni_VisionTransformer][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3_VisionPatchEmbed][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3_VisionPatchMerger][] ## Configuration @@ -89,4 +108,6 @@ Worker classes and model runners for distributed inference. - [vllm_omni.worker.gpu_ar_worker.GPUARWorker][] - [vllm_omni.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][] - [vllm_omni.worker.gpu_diffusion_worker.GPUDiffusionWorker][] +- [vllm_omni.worker.gpu_generation_model_runner.GPUGenerationModelRunner][] +- [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][] - [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][] diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py new file mode 100644 index 0000000000..46bea983f2 --- /dev/null +++ b/docs/mkdocs/hooks/generate_examples.py @@ -0,0 +1,335 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + +import regex as re +import yaml + +logger = logging.getLogger("mkdocs") + +ROOT_DIR = Path(__file__).parent.parent.parent.parent +ROOT_DIR_RELATIVE = "../../../../.." 
+EXAMPLE_DIR = ROOT_DIR / "examples" +EXAMPLE_DOC_DIR = ROOT_DIR / "docs/user_guide/examples" +NAV_FILE = ROOT_DIR / "docs/.nav.yml" + + +def fix_case(text: str) -> str: + subs = { + "api": "API", + "cli": "CLI", + "cpu": "CPU", + "llm": "LLM", + "mae": "MAE", + "tpu": "TPU", + "gguf": "GGUF", + "lora": "LoRA", + "rlhf": "RLHF", + "vllm": "vLLM", + "openai": "OpenAI", + "lmcache": "LMCache", + "multilora": "MultiLoRA", + "mlpspeculator": "MLPSpeculator", + r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 + r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16 + } + for pattern, repl in subs.items(): + text = re.sub(rf"\b{pattern}\b", repl, text, flags=re.IGNORECASE) + return text + + +@dataclass +class Example: + """ + Example class for generating documentation content from a given path. + + Attributes: + path (Path): The path to the main directory or file. + category (str): The category of the document. + main_file (Path): The main file in the directory. + other_files (list[Path]): list of other files in the directory. + title (str): The title of the document. + + Methods: + __post_init__(): Initializes the main_file, other_files, and title attributes. + determine_main_file() -> Path: Determines the main file in the given path. + determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. + determine_title() -> str: Determines the title of the document. + generate() -> str: Generates the documentation content. 
+ """ # noqa: E501 + + path: Path + category: str = None + main_file: Path = field(init=False) + other_files: list[Path] = field(init=False) + title: str = field(init=False) + + def __post_init__(self): + self.main_file = self.determine_main_file() + self.other_files = self.determine_other_files() + self.title = self.determine_title() + + @property + def is_code(self) -> bool: + return self.main_file.suffix != ".md" + + def determine_main_file(self) -> Path: + """ + Determines the main file in the given path. + If the path is a file, it returns the path itself. Otherwise, it searches + for Markdown files (*.md) in the directory and returns the first one found. + Returns: + Path: The main file path, either the original path if it's a file or the first + Markdown file found in the directory. + Raises: + IndexError: If no Markdown files are found in the directory. + """ # noqa: E501 + return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop() + + def determine_other_files(self) -> list[Path]: + """ + Determine other files in the directory excluding the main file. + + This method checks if the given path is a file. If it is, it returns an empty list. + Otherwise, it recursively searches through the directory and returns a list of all + files that are not the main file. + + Returns: + list[Path]: A list of Path objects representing the other files in the directory. 
+ """ # noqa: E501 + if self.path.is_file(): + return [] + # Binary file extensions to exclude + binary_extensions = { + ".wav", + ".mp3", + ".mp4", + ".avi", + ".mov", + ".mkv", # Audio/Video + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".ico", + ".svg", # Images + ".pdf", + ".zip", + ".tar", + ".gz", + ".bz2", + ".xz", # Archives/Documents + ".exe", + ".so", + ".dll", + ".dylib", # Binaries + ".bin", + ".dat", + ".db", + ".sqlite", # Data files + } + + def is_other_file(file: Path) -> bool: + return file.is_file() and file != self.main_file and file.suffix.lower() not in binary_extensions + + return [file for file in self.path.rglob("*") if is_other_file(file)] + + def determine_title(self) -> str: + if not self.is_code: + # Specify encoding for building on Windows + with open(self.main_file, encoding="utf-8") as f: + first_line = f.readline().strip() + match = re.match(r"^#\s+(?P.+)$", first_line) + if match: + return match.group("title") + return fix_case(self.path.stem.replace("_", " ").title()) + + def fix_relative_links(self, content: str) -> str: + """ + Fix relative links in markdown content by converting them to gh-file + format. 
+ + Args: + content (str): The markdown content to process + + Returns: + str: Content with relative links converted to gh-file format + """ + # Regex to match markdown links [text](relative_path) + # This matches links that don't start with http, https, ftp, or # + link_pattern = r"\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)" + + def replace_link(match): + link_text = match.group(1) + relative_path = match.group(2) + + # Make relative to repo root + gh_file = (self.main_file.parent / relative_path).resolve() + gh_file = gh_file.relative_to(ROOT_DIR) + + # Make GitHub URL + url = "https://github.com/vllm-project/vllm-omni/" + url += "tree/main" if self.path.is_dir() else "blob/main" + gh_url = f"{url}/{gh_file}" + + return f"[{link_text}]({gh_url})" + + return re.sub(link_pattern, replace_link, content) + + def generate(self) -> str: + content = f"# {self.title}\n\n" + url = "https://github.com/vllm-project/vllm-omni/" + url += "tree/main" if self.path.is_dir() else "blob/main" + content += f"Source <{url}/{self.path.relative_to(ROOT_DIR)}>.\n\n" + + # Use long code fence to avoid issues with + # included files containing code fences too + code_fence = "``````" + + if self.is_code: + content += f'{code_fence}{self.main_file.suffix[1:]}\n--8<-- "{self.main_file.relative_to(ROOT_DIR)}"\n{code_fence}\n' + else: + with open(self.main_file) as f: + # Skip the title from md snippets as it's been included above + main_content = f.readlines()[1:] + content += self.fix_relative_links("".join(main_content)) + content += "\n" + + if not self.other_files: + return content + + content += "## Example materials\n\n" + for file in sorted(self.other_files): + content += f'??? 
abstract "{file.relative_to(self.path)}"\n' + if file.suffix != ".md": + content += f" {code_fence}{file.suffix[1:]}\n" + content += f' --8<-- "{file.relative_to(ROOT_DIR)}"\n' + if file.suffix != ".md": + content += f" {code_fence}\n" + + return content + + +def update_nav_file(examples: list[Example]): + """ + Update the .nav.yml file to include all generated examples. + This function completely regenerates the examples section based on the actual + folder structure, ensuring consistency between the examples folder and nav file. + + Args: + examples: List of Example objects that have been generated + """ + if not NAV_FILE.exists(): + logger.warning("Navigation file not found: %s", NAV_FILE) + return + + # Read the current nav file + with open(NAV_FILE, encoding="utf-8") as f: + nav_data = yaml.safe_load(f) or {} + + nav_list = nav_data.get("nav", []) + + # Find the "User Guide" section + user_guide_idx = None + examples_idx = None + for i, item in enumerate(nav_list): + if isinstance(item, dict) and "User Guide" in item: + user_guide_idx = i + user_guide_content = item["User Guide"] + # Find the "Examples" subsection + for j, subitem in enumerate(user_guide_content): + if isinstance(subitem, dict) and "Examples" in subitem: + examples_idx = j + break + break + + if user_guide_idx is None or examples_idx is None: + logger.warning("Could not find 'User Guide' -> 'Examples' section in nav file") + return + + # Get existing Examples section to preserve non-example items (like README.md) + existing_examples_content = nav_list[user_guide_idx]["User Guide"][examples_idx]["Examples"] + + # Preserve string items (like "examples/README.md") that are not example categories + preserved_items = [ + item + for item in existing_examples_content + if isinstance(item, str) and not item.startswith("user_guide/examples/") + ] + + # Group examples by category + examples_by_category = {} + for example in examples: + category = example.category + if category not in examples_by_category: + 
examples_by_category[category] = [] + examples_by_category[category].append(example) + + # Build the new Examples section - start with preserved items + examples_section = preserved_items.copy() + + # Add examples grouped by category, sorted by category name + for category in sorted(examples_by_category.keys()): + category_examples = sorted(examples_by_category[category], key=lambda e: e.path.stem) + category_items = [] + for example in category_examples: + doc_path = EXAMPLE_DOC_DIR / example.category / f"{example.path.stem}.md" + rel_path = doc_path.relative_to(ROOT_DIR / "docs") + category_items.append({example.title: str(rel_path)}) + + if category_items: + # Format category name (e.g., "offline_inference" -> "Offline Inference") + category_title = fix_case(category.replace("_", " ").title()) + examples_section.append({category_title: category_items}) + + # Update the nav structure + nav_list[user_guide_idx]["User Guide"][examples_idx]["Examples"] = examples_section + + # Write back to file + nav_data["nav"] = nav_list + with open(NAV_FILE, "w", encoding="utf-8") as f: + yaml.dump(nav_data, f, default_flow_style=False, sort_keys=False, allow_unicode=True) + logger.info("Updated navigation file: %s", NAV_FILE.relative_to(ROOT_DIR)) + + +def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): + logger.info("Generating example documentation") + logger.debug("Root directory: %s", ROOT_DIR.resolve()) + logger.debug("Example directory: %s", EXAMPLE_DIR.resolve()) + logger.debug("Example document directory: %s", EXAMPLE_DOC_DIR.resolve()) + + # Create the EXAMPLE_DOC_DIR if it doesn't exist + if not EXAMPLE_DOC_DIR.exists(): + EXAMPLE_DOC_DIR.mkdir(parents=True) + + categories = sorted(p for p in EXAMPLE_DIR.iterdir() if p.is_dir()) + + examples = [] + glob_patterns = ["*.py", "*.md", "*.sh"] + # Find categorised examples + for category in categories: + globs = [category.glob(pattern) for pattern in glob_patterns] + for path in 
itertools.chain(*globs): + examples.append(Example(path, category.stem)) + # Find examples in subdirectories + for path in category.glob("*/*.md"): + examples.append(Example(path.parent, category.stem)) + + # Generate the example documentation + for example in sorted(examples, key=lambda e: e.path.stem): + example_name = f"{example.path.stem}.md" + doc_path = EXAMPLE_DOC_DIR / example.category / example_name + if not doc_path.parent.exists(): + doc_path.parent.mkdir(parents=True) + # Specify encoding for building on Windows + with open(doc_path, "w+", encoding="utf-8") as f: + f.write(example.generate()) + logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR)) + + # Update the navigation file + update_nav_file(examples) diff --git a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md index c11360bcfe..7ec3ed3afe 100644 --- a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md +++ b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md @@ -1,55 +1,40 @@ -# Offline Inference Example of vLLM-Omni for Qwen2.5-Omni +# Offline Example of vLLM-Omni for Qwen2.5-omni -Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen2_5_omni>. +Source <https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen2_5_omni>. ## đŸ› ī¸ Installation -Please refer to [installation](../../../getting_started/installation/README.md). - -## Run Offline inference with Qwen2.5-Omni. -First, navigate to the example folder -```bash -cd examples/offline_inference/qwen2_5_omni -``` -Inside the directory, `end2end.py` is a comprehensive demo suite for initializing a model instance of `Qwen/Qwen2.5-Omni-7B` and use it for various offline inference tasks. -??? 
abstract "end2end.py" - ``````py - --8<-- "examples/offline_inference/qwen2_5_omni/end2end.py" - `````` - -Below we also provide simple bash scripts to execute this file. -### Single Prompt -```bash -bash run_single_prompt.sh -``` -??? abstract "run_single_prompt.sh" - ``````sh - --8<-- "examples/offline_inference/qwen2_5_omni/run_single_prompt.sh" - `````` +Please refer to [README.md](https://github.com/vllm-project/vllm/tree/main/README.md) +## Run examples (Qwen2.5-omni) ### Multiple Prompts -Download dataset from [seed_tts](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit) and extract the prompts with `extract_prompts.py` -??? abstract "extract_prompts.py" - ``````py - --8<-- "examples/offline_inference/qwen2_5_omni/extract_prompts.py" - `````` +Download dataset from [seed_tts](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit). To get the prompt, you can: ```bash tar -xf <Your Download Path>/seedtts_testset.tar cp seedtts_testset/en/meta.lst examples/offline_inference/qwen2_5_omni/meta.lst python3 examples/offline_inference/qwen2_5_omni/extract_prompts.py \ --input examples/offline_inference/qwen2_5_omni/meta.lst \ --output examples/offline_inference/qwen2_5_omni/top100.txt \ - --topk 10 + --topk 100 +``` +Get into the example folder +```bash +cd examples/offline_inference/qwen2_5_omni ``` Then run the command below. ```bash bash run_multiple_prompts.sh ``` -??? abstract "run_multiple_prompts.sh" - ``````sh - --8<-- "examples/offline_inference/qwen2_5_omni/run_multiple_prompts.sh" - `````` +### Single Prompt +Get into the example folder +```bash +cd examples/offline_inference/qwen2_5_omni +``` +Then run the command below. +```bash +bash run_single_prompt.sh +``` ### FAQ @@ -58,3 +43,22 @@ If you encounter error about backend of librosa, try to install ffmpeg with comm sudo apt update sudo apt install ffmpeg ``` + +## Example materials + +??? 
abstract "end2end.py" + ``````py + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen2_5_omni/end2end.py" + `````` +??? abstract "extract_prompts.py" + ``````py + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen2_5_omni/extract_prompts.py" + `````` +??? abstract "run_multiple_prompts.sh" + ``````sh + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen2_5_omni/run_multiple_prompts.sh" + `````` +??? abstract "run_single_prompt.sh" + ``````sh + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh" + `````` diff --git a/docs/user_guide/examples/offline_inference/qwen3_omni.md b/docs/user_guide/examples/offline_inference/qwen3_omni.md new file mode 100644 index 0000000000..58ea77b2b3 --- /dev/null +++ b/docs/user_guide/examples/offline_inference/qwen3_omni.md @@ -0,0 +1,64 @@ +# Offline Example of vLLM-Omni for Qwen3-omni + +Source <https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_omni>. + + +## đŸ› ī¸ Installation + +Please refer to [README.md](https://github.com/vllm-project/vllm/tree/main/README.md) + +## Run examples (Qwen3-omni) +### Multiple Prompts +Download dataset from [seed_tts](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit). For processing dataset please refer to [Qwen2.5-omni README.md](https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen2_5_omni/README.md) +Get into the example folder +```bash +cd examples/offline_inference/qwen3_omni +``` +Then run the command below. +```bash +bash run_multiple_prompts.sh +``` +### Single Prompt +Get into the example folder +```bash +cd examples/offline_inference/qwen3_omni +``` +Then run the command below. 
+```bash +bash run_single_prompt.sh +``` +If you have not enough memory, you can set thinker with tensor parallel. Just run the command below. +```bash +bash run_single_prompt_tp.sh +``` + +### FAQ + +If you encounter error about backend of librosa, try to install ffmpeg with command below. +``` +sudo apt update +sudo apt install ffmpeg +``` + +## Example materials + +??? abstract "end2end.py" + ``````py + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen3_omni/end2end.py" + `````` +??? abstract "qwen3_omni_moe_tp.yaml" + ``````yaml + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml" + `````` +??? abstract "run_multiple_prompts.sh" + ``````sh + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen3_omni/run_multiple_prompts.sh" + `````` +??? abstract "run_single_prompt.sh" + ``````sh + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen3_omni/run_single_prompt.sh" + `````` +??? abstract "run_single_prompt_tp.sh" + ``````sh + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh" + `````` diff --git a/docs/user_guide/examples/offline_inference/qwen_image.md b/docs/user_guide/examples/offline_inference/qwen_image.md index 904a2c0910..ce2c29b2a1 100644 --- a/docs/user_guide/examples/offline_inference/qwen_image.md +++ b/docs/user_guide/examples/offline_inference/qwen_image.md @@ -1,25 +1,15 @@ -# Offline Inference Example of vLLM-Omni for Qwen-Image +# Qwen-Image Offline Inference -Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen_image>. +Source <https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen_image>. -## đŸ› ī¸ Installation - -Please refer to [installation](../../../getting_started/installation/README.md). 
- -## Run Offline inference with Qwen-Image - -First, navigate to the example folder -```bash -cd examples/offline_inference/qwen_image -``` - This folder provides two simple entrypoints for experimenting with `Qwen/Qwen-Image` using vLLM-Omni: - `text_to_image.py`: command-line script for single image generation. -- `gradio_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration. +- `web_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration. -### Command-line Usage + +## Local CLI Usage ```bash python text_to_image.py \ @@ -43,14 +33,9 @@ Key arguments: - `--height/--width`: output resolution (defaults 1024x1024). - `--output`: path to save the generated PNG. -??? abstract "text_to_image.py" - ``````py - --8<-- "examples/offline_inference/qwen_image/text_to_image.py" - `````` - > â„šī¸ Qwen-Image currently publishes best-effort presets at `1328x1328`, `1664x928`, `928x1664`, `1472x1140`, `1140x1472`, `1584x1056`, and `1056x1584`. Adjust `--height/--width` accordingly for the most reliable outcomes. -### Web UI Demo +## Web UI Demo Launch the gradio demo: @@ -60,7 +45,13 @@ python gradio_demo.py --port 7862 Then open `http://localhost:7862/` on your local browser to interact with the web UI. +## Example materials + ??? abstract "gradio_demo.py" ``````py - --8<-- "examples/offline_inference/qwen_image/gradio_demo.py" + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen_image/gradio_demo.py" + `````` +??? 
abstract "text_to_image.py" + ``````py + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/offline_inference/qwen_image/text_to_image.py" `````` diff --git a/docs/user_guide/examples/online_serving/qwen2_5_omni.md b/docs/user_guide/examples/online_serving/qwen2_5_omni.md index 96c283d168..7918c5f841 100644 --- a/docs/user_guide/examples/online_serving/qwen2_5_omni.md +++ b/docs/user_guide/examples/online_serving/qwen2_5_omni.md @@ -1,43 +1,38 @@ -# Online Serving Example of vLLM-Omni for Qwen2.5-Omni +# Online serving Example of vLLM-Omni for Qwen2.5-omni -Source <https://github.com/vllm-project/vllm-omni/blob/main/examples/online_serving/README.md>. +Source <https://github.com/vllm-project/vllm/tree/main/examples/online_serving/qwen2_5_omni>. ## đŸ› ī¸ Installation -Please refer to [installation](../../../getting_started/installation/README.md). +Please refer to [README.md](https://github.com/vllm-project/vllm/tree/main/examples/README.md) -## Deploy Qwen/Qwen2.5-Omni-7B +## Run examples (Qwen2.5-omni) -First launch the OpenAI-compatible inference server +Launch the server ```bash vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091 ``` + If you have custom stage configs file, launch the server with command below ```bash vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091 --stage-configs-path /path/to/stage_configs_file ``` -## Query the model -Navigate to the example folder + +Get into the example folder ```bash cd examples/online_serving ``` -Query the model server with OpenAI Python API client: + +Send request via python ```bash python openai_chat_completion_client_for_multimodal_generation.py --query-type mixed_modalities ``` -??? 
abstract "openai_chat_completion_client_for_multimodal_generation.py" - ``````py - --8<-- "examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py" - `````` -You can also query the model with `curl` command: + +Send request via curl ```bash bash run_curl_multimodal_generation.sh mixed_modalities ``` -??? abstract "run_curl_multimodal_generation.sh" - ``````py - --8<-- "examples/online_serving/run_curl_multimodal_generation.sh" - `````` ### FAQ @@ -49,16 +44,17 @@ sudo apt install ffmpeg ## Run Local Web UI Demo -You can also deploy a Gradio Web UI that allows users to interact with the model through a web browser. Below is an example on how to do so with `Qwen/Qwen2.5-Omni-7B`. +This Web UI demo allows users to interact with the model through a web browser. ### Running Gradio Demo -Install gradio with `uv pip install "gradio>=5.49.1,<6.0.0"`, then you can launch the web service built on AsyncOmni by +Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by ```bash python gradio_demo.py --model Qwen/Qwen2.5-Omni-7B --port 7861 ``` -Now you can interact with model via the web UI at `http://localhost:7861/` on your local browser. + +Then open `http://localhost:7861/` on your local browser to interact with the web UI. ### Options @@ -78,3 +74,18 @@ python gradio_demo.py \ - `--port`: Port for the Gradio server (default `7861`). - `--stage-configs-path`: Optional path to custom stage configs YAML. - `--share`: Set to expose a temporary public link via Gradio. + +## Example materials + +??? abstract "gradio_demo.py" + ``````py + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/online_serving/qwen2_5_omni/gradio_demo.py" + `````` +??? abstract "openai_chat_completion_client_for_multimodal_generation.py" + ``````py + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/online_serving/qwen2_5_omni/openai_chat_completion_client_for_multimodal_generation.py" + `````` +??? 
abstract "run_curl_multimodal_generation.sh" + ``````sh + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/online_serving/qwen2_5_omni/run_curl_multimodal_generation.sh" + `````` diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md new file mode 100644 index 0000000000..3d1ba770bc --- /dev/null +++ b/docs/user_guide/examples/online_serving/qwen3_omni.md @@ -0,0 +1,95 @@ +# Online serving Example of vLLM-Omni for Qwen3-omni + +Source <https://github.com/vllm-project/vllm/tree/main/examples/online_serving/qwen3_omni>. + + +## đŸ› ī¸ Installation + +Please refer to [README.md](https://github.com/vllm-project/vllm/tree/main/examples/README.md) + +## Run examples (Qwen3-Omni) + +Launch the server +```bash +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 +``` + +If you have custom stage configs file, launch the server with command below +```bash +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file +``` + +Get into the example folder +```bash +cd examples/online_serving +``` + +Send request via python +```bash +python openai_chat_completion_client_for_multimodal_generation.py --query-type mixed_modalities +``` + +Send request via curl +```bash +bash run_curl_multimodal_generation.sh mixed_modalities +``` + +### FAQ + +If you encounter error about backend of librosa, try to install ffmpeg with command below. +``` +sudo apt update +sudo apt install ffmpeg +``` + +## Run Local Web UI Demo + +This Web UI demo allows users to interact with the model through a web browser. + +### Running Gradio Demo + +Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by + +```bash +python gradio_demo.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 7861 +``` + +Then open `http://localhost:7861/` on your local browser to interact with the web UI. 
+ + +### Options + +The gradio demo also supports running with an existing API server and can be customized with the following arguments. + + +```bash +python gradio_demo.py \ + --model Qwen/Qwen3-Omni-30B-A3B-Instruct \ + --use-api-server \ + --api-base http://localhost:8091/v1 \ + --ip 127.0.0.1 \ + --port 7861 +``` + +- `--model`: Model name +- `--use-api-server`: If set, connect to an existing vLLM HTTP API server instead of running AsyncOmniLLM locally. +- `--api-base`: Base URL for vllm serve (only used when `use-api-server` is set, default: http://localhost:8091/v1) +- `--ip`: Host/IP for Gradio server (default: 127.0.0.1) +- `--port`: Port for Gradio server (default: 7861) +- `--stage-configs-path`: Path to custom stage configs YAML file (optional) +- `--share`: Share the Gradio demo publicly (creates a public link) + +## Example materials + +??? abstract "gradio_demo.py" + ``````py + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/online_serving/qwen3_omni/gradio_demo.py" + `````` +??? abstract "openai_chat_completion_client_for_multimodal_generation.py" + ``````py + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/online_serving/qwen3_omni/openai_chat_completion_client_for_multimodal_generation.py" + `````` +??? abstract "run_curl_multimodal_generation.sh" + ``````sh + --8<-- "/mnt/vllm_open_release/vllm-omni-cursor/vllm-omni/examples/online_serving/qwen3_omni/run_curl_multimodal_generation.sh" + `````` diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md index afdaee13ea..4c52925991 100644 --- a/examples/offline_inference/qwen2_5_omni/README.md +++ b/examples/offline_inference/qwen2_5_omni/README.md @@ -23,7 +23,7 @@ Then run the command below. 
```bash bash run_multiple_prompts.sh ``` -### Single Prompts +### Single Prompt Get into the example folder ```bash cd examples/offline_inference/qwen2_5_omni diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index c5be18a94a..463db0aa89 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -16,7 +16,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils import FlexibleArgumentParser -from vllm_omni.entrypoints.omni_llm import OmniLLM +from vllm_omni.entrypoints.omni import Omni SEED = 42 @@ -140,7 +140,7 @@ def main(args): model_name = "Qwen/Qwen2.5-Omni-7B" query_result = query_map[args.query_type]() - omni_llm = OmniLLM( + omni_llm = Omni( model=model_name, log_stats=args.enable_stats, log_file=("omni_llm_pipeline.log" if args.enable_stats else None), diff --git a/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh b/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh index 5b3c19cdc2..2f2b3ae756 100644 --- a/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh +++ b/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh @@ -1,2 +1,2 @@ python end2end.py --output-wav output_audio \ - --query-type use_audio_in_video + --query-type text diff --git a/examples/offline_inference/qwen3_omni/README.md b/examples/offline_inference/qwen3_omni/README.md index 766453667f..22c7651632 100644 --- a/examples/offline_inference/qwen3_omni/README.md +++ b/examples/offline_inference/qwen3_omni/README.md @@ -1,4 +1,4 @@ -# Offline Example of vLLM-Omni for Qwen2.5-omni +# Offline Example of vLLM-Omni for Qwen3-omni ## đŸ› ī¸ Installation @@ -15,7 +15,7 @@ Then run the command below. ```bash bash run_multiple_prompts.sh ``` -### Single Prompts +### Single Prompt Get into the example folder ```bash cd examples/offline_inference/qwen3_omni @@ -24,6 +24,10 @@ Then run the command below. 
```bash bash run_single_prompt.sh ``` +If you have not enough memory, you can set thinker with tensor parallel. Just run the command below. +```bash +bash run_single_prompt_tp.sh +``` ### FAQ diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index c1fa21a621..54f74c6097 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -16,7 +16,7 @@ from vllm.multimodal.image import convert_image_mode from vllm.utils import FlexibleArgumentParser -from vllm_omni.entrypoints.omni_llm import OmniLLM +from vllm_omni.entrypoints.omni import Omni SEED = 42 @@ -127,8 +127,9 @@ def main(args): model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct" query_result = query_map[args.query_type]() - omni_llm = OmniLLM( + omni_llm = Omni( model=model_name, + stage_configs_path=args.stage_configs_path, ) thinker_sampling_params = SamplingParams( @@ -276,6 +277,12 @@ def parse_args(): default=None, help="Path to a .txt file with one prompt per line (preferred).", ) + parser.add_argument( + "--stage-configs-path", + type=str, + default=None, + help="Path to a stage configs file.", + ) return parser.parse_args() @@ -283,7 +290,3 @@ def parse_args(): if __name__ == "__main__": args = parse_args() main(args) - - # use examples: - # python end2end.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --query-type text - # python end2end.py --model /custom_path/Qwen3-Omni-30B-A3B-Instruct --query-type use_video_only diff --git a/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml b/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml new file mode 100644 index 0000000000..8d9c2900e6 --- /dev/null +++ b/examples/offline_inference/qwen3_omni/qwen3_omni_moe_tp.yaml @@ -0,0 +1,94 @@ +# Stage config for running Qwen3-Omni-MoE with 3-stage architecture +# Stage 0: Thinker (multimodal understanding + text generation) +# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) 
+# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) + +stage_args: + - stage_id: 0 + runtime: + devices: "0,1" + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.scheduler.OmniScheduler + gpu_memory_utilization: 0.8 + enforce_eager: true + trust_remote_code: true + engine_output_type: latent # Output hidden states for talker + distributed_executor_backend: "mp" + enable_prefix_caching: false + hf_config_name: thinker_config + tensor_parallel_size: 2 + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + + - stage_id: 1 + runtime: + devices: "1" + max_batch_size: 1 + engine_args: + model_stage: talker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.scheduler.OmniScheduler + gpu_memory_utilization: 0.5 + enforce_eager: true + trust_remote_code: true + engine_output_type: latent # Output codec codes for code2wav + # tensor_parallel_size: 2 + enable_prefix_caching: false + distributed_executor_backend: "mp" + hf_config_name: talker_config + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker + # final_output: true + # final_output_type: text + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: False + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 2 + runtime: + devices: "1" + max_batch_size: 1 + engine_args: + model_stage: code2wav + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker + scheduler_cls: 
vllm_omni.core.sched.generation_scheduler.GenerationScheduler + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: audio # Final output: audio waveform + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 1000000 + hf_config_name: thinker_config + engine_input_source: [1] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: True + repetition_penalty: 1.1 diff --git a/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh b/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh new file mode 100644 index 0000000000..eb25b6bc05 --- /dev/null +++ b/examples/offline_inference/qwen3_omni/run_single_prompt_tp.sh @@ -0,0 +1,6 @@ +python end2end.py --output-wav output_audio \ + --query-type use_audio \ + --init-sleep-seconds 90 \ + --stage-configs-path qwen3_omni_moe_tp.yaml + +# init-sleep-seconds works to avoid two vLLM stages initialized at the same time within a card. 
diff --git a/examples/online_serving/qwen3_omni/README.md b/examples/online_serving/qwen3_omni/README.md index 01a0fdd286..6e4b27ea38 100644 --- a/examples/online_serving/qwen3_omni/README.md +++ b/examples/online_serving/qwen3_omni/README.md @@ -1,4 +1,4 @@ -# Online serving Example of vLLM-omni for Qwen2.5-omni +# Online serving Example of vLLM-Omni for Qwen3-omni ## đŸ› ī¸ Installation diff --git a/mkdocs.yml b/mkdocs.yml index dfa29f59bd..31ca350fa4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -58,6 +58,7 @@ theme: hooks: - docs/mkdocs/hooks/generate_api_readme.py - docs/mkdocs/hooks/url_schemes.py + - docs/mkdocs/hooks/generate_examples.py # Plugins plugins: diff --git a/vllm_omni/__init__.py b/vllm_omni/__init__.py index 89c7cbfc6c..fd7f9174fb 100644 --- a/vllm_omni/__init__.py +++ b/vllm_omni/__init__.py @@ -24,7 +24,7 @@ from .entrypoints.async_omni_llm import AsyncOmniLLM # Main entry points -# from .entrypoints.omni import Omni +from .entrypoints.omni import Omni from .version import __version__, __version_tuple__ # isort:skip @@ -33,7 +33,7 @@ "__version__", "__version_tuple__", # Main components - # "Omni", + "Omni", "AsyncOmniLLM", # Configuration "OmniModelConfig", diff --git a/vllm_omni/diffusion/models/__init__.py b/vllm_omni/diffusion/models/__init__.py new file mode 100644 index 0000000000..9e7471a2da --- /dev/null +++ b/vllm_omni/diffusion/models/__init__.py @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Diffusion model implementations.""" diff --git a/vllm_omni/diffusion/models/qwen_image/__init__.py b/vllm_omni/diffusion/models/qwen_image/__init__.py index 69d6821f4f..84fa2259d4 100644 --- a/vllm_omni/diffusion/models/qwen_image/__init__.py +++ b/vllm_omni/diffusion/models/qwen_image/__init__.py @@ -1,3 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Qwen Image model components.""" +"""Qwen 
Image diffusion model components.""" + +from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image import ( + QwenImagePipeline, + get_qwen_image_post_process_func, +) +from vllm_omni.diffusion.models.qwen_image.qwen_image_transformer import ( + QwenImageTransformer2DModel, +) + +__all__ = [ + "QwenImagePipeline", + "QwenImageTransformer2DModel", + "get_qwen_image_post_process_func", +] diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py index cbedc5c317..69917361a1 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py @@ -392,8 +392,6 @@ def encode_prompt( Args: prompt (`str` or `list[str]`, *optional*): prompt to be encoded - device: (`torch.device`): - torch device num_images_per_prompt (`int`): number of images that should be generated per prompt prompt_embeds (`torch.Tensor`, *optional*): diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index c0ef5fd123..b2f4e35bdb 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -113,10 +113,12 @@ def __init__(self, theta: int, axes_dim: list[int], scale_rope=False): # DO NOT USING REGISTER BUFFER HERE, IT WILL CAUSE COMPLEX NUMBERS LOSE ITS IMAGINARY PART self.scale_rope = scale_rope - def rope_params(self, index, dim, theta=10000): + def rope_params(self, index: torch.Tensor, dim: int, theta: int = 10000): """ Args: - index: [0, 1, 2, 3] 1D Tensor representing the position index of the token + index (`torch.Tensor`): [0, 1, 2, 3] 1D Tensor representing the position index of the token + dim (`int`): Dimension for the rope parameters + theta (`int`): Theta parameter for rope """ assert dim % 2 == 0 freqs = torch.outer( diff --git 
a/vllm_omni/diffusion/worker/__init__.py b/vllm_omni/diffusion/worker/__init__.py index 068a25f8f9..dc3306dae3 100644 --- a/vllm_omni/diffusion/worker/__init__.py +++ b/vllm_omni/diffusion/worker/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Diffusion worker components.""" +"""Worker classes for diffusion models.""" from vllm_omni.diffusion.worker.gpu_worker import GPUWorker, WorkerProc diff --git a/vllm_omni/entrypoints/async_omni_llm.py b/vllm_omni/entrypoints/async_omni_llm.py index 5046196d05..c45acdc44a 100644 --- a/vllm_omni/entrypoints/async_omni_llm.py +++ b/vllm_omni/entrypoints/async_omni_llm.py @@ -94,7 +94,7 @@ def __init__( shm_threshold_bytes: int = 65536, batch_timeout: int = 10, init_timeout: int = 60000, - **kwargs, + **kwargs: Any, ): self.batch_timeout = batch_timeout self._enable_stats: bool = bool(log_stats) diff --git a/vllm_omni/entrypoints/omni_llm.py b/vllm_omni/entrypoints/omni_llm.py index 4b9b1b4f64..94d93ef767 100644 --- a/vllm_omni/entrypoints/omni_llm.py +++ b/vllm_omni/entrypoints/omni_llm.py @@ -534,8 +534,9 @@ def __init__( self, model: str, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, + hf_overrides: Optional[dict[str, Any]] = None, structured_outputs_config: Optional[Union[dict[str, Any], StructuredOutputsConfig]] = None, - **kwargs, + **kwargs: Any, ): """LLM constructor.""" if "disable_log_stats" not in kwargs: diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py index 49215be36d..5c5c5b15d1 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py @@ -6,6 +6,7 @@ autoregressively, predicting layers 1 to N based on layer-0 codes from the talker. 
""" +from collections import namedtuple from typing import Any, Optional import torch @@ -164,21 +165,48 @@ def forward( from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS - attention_interface = ALL_ATTENTION_FUNCTIONS["flash_attention_2"] - attn_output, _ = attention_interface( - self, - q_heads, - k_heads, - v_heads, - None, - dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.head_dim**-0.5, - sliding_window=None, - use_cache=use_cache, - position_ids=position_ids, - output_hidden_states=True, - output_attentions=False, - ) + # Try attention backends in order of preference, with runtime error handling + # This handles cases where the backend is registered but not actually available + attention_backends = ["flash_attention_2", "xformers", "eager", "sdpa"] + attn_output = None + last_error = None + + for backend_name in attention_backends: + if backend_name not in ALL_ATTENTION_FUNCTIONS: + continue + + try: + attention_interface = ALL_ATTENTION_FUNCTIONS[backend_name] + attn_output, _ = attention_interface( + self, + q_heads, + k_heads, + v_heads, + None, + dropout=0.0 if not self.training else getattr(self, "attention_dropout", 0.0), + scaling=self.head_dim**-0.5, + sliding_window=None, + use_cache=use_cache, + position_ids=position_ids, + output_hidden_states=True, + output_attentions=False, + ) + # Success - log fallback if not using flash_attention_2 + if backend_name != "flash_attention_2": + logger.warning_once( + f"Using {backend_name} attention backend (flash_attention_2 not available or failed)" + ) + break + except (ValueError, ImportError, RuntimeError, AttributeError) as e: + # Store error and try next backend + last_error = e + continue + + if attn_output is None: + raise RuntimeError( + f"All attention backends failed. Last error: {last_error}. " + "Please install flash-attn, or ensure PyTorch's scaled_dot_product_attention is available." 
+ ) attn_output = attn_output.reshape(*(hidden_states.shape[:-1]), -1).contiguous() attn_output = self.o_proj(attn_output) @@ -373,16 +401,22 @@ def forward( past_key_values: Optional[Any] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - **kwargs, - ): + **kwargs: Any, + ) -> Any: """ Forward pass matching HF structure. Args: inputs_embeds: [batch, seq_len, hidden_size] + attention_mask: Optional attention mask tensor + position_ids: Optional position IDs tensor + past_key_values: Optional cached key-value pairs + use_cache: Whether to use cache + cache_position: Optional cache position tensor + **kwargs: Additional keyword arguments Returns: - Object with .last_hidden_state attribute + Named tuple with .last_hidden_state and .past_key_values attributes """ batch_size, seq_len, _ = inputs_embeds.shape @@ -421,8 +455,6 @@ def forward( hidden_states = self.norm(hidden_states) # Return in HF-compatible format - from collections import namedtuple - Output = namedtuple("Output", ["last_hidden_state", "past_key_values"]) return Output(last_hidden_state=hidden_states, past_key_values=None) # [batch, num_code_groups-1, hidden_size] diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py index 34e05925f9..91231d0fbc 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py @@ -1,4 +1,5 @@ from collections.abc import Iterable +from typing import Any import torch import torch.nn as nn @@ -284,7 +285,7 @@ def code_predictor_forward( return result_codes, summed_embeddings - def init_multi_modal(self, thinker_config): + def init_multi_modal(self, thinker_config: Any) -> None: """ Initialize multimodal components from the thinker. 
diff --git a/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py b/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py index e2d07aa1bb..b3b5500ef1 100644 --- a/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py +++ b/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py @@ -3,7 +3,7 @@ # Copyright 2025 The Qwen team. """Stage input processor for Qwen3 Omni MoE: Thinker → Talker transition.""" -from typing import Union +from typing import Any, Union import torch from vllm.inputs import TextPrompt @@ -47,11 +47,11 @@ def _compute_talker_prompt_ids_length(info): def thinker2talker( - stage_list, - engine_input_source, - prompt: Union[OmniTokensPrompt, TextPrompt] = None, + stage_list: list[Any], + engine_input_source: list[int], + prompt: Union[OmniTokensPrompt, TextPrompt, None] = None, requires_multimodal_data: bool = False, -): +) -> list[OmniTokensPrompt]: """ Process thinker outputs to create talker inputs. @@ -64,6 +64,7 @@ def thinker2talker( stage_list: List of stage objects engine_input_source: Source stage IDs (typically [0] for thinker) prompt: Original prompt data + requires_multimodal_data: Whether multimodal data is required Returns: List of OmniTokensPrompt for talker stage @@ -111,11 +112,11 @@ def thinker2talker( def talker2code2wav( - stage_list, - engine_input_source, - prompt: Union[OmniTokensPrompt, TextPrompt] = None, + stage_list: list[Any], + engine_input_source: list[int], + prompt: Union[OmniTokensPrompt, TextPrompt, None] = None, requires_multimodal_data: bool = False, -): +) -> list[OmniTokensPrompt]: """ Process talker outputs to create code2wav inputs. @@ -128,6 +129,7 @@ def talker2code2wav( stage_list: List of stage objects engine_input_source: Source stage IDs (typically [1] for talker) prompt: Original prompt data + requires_multimodal_data: Whether multimodal data is required Returns: List of OmniTokensPrompt for code2wav stage