Merged
8 changes: 2 additions & 6 deletions docs/api/README.md
@@ -4,8 +4,8 @@

Main entry points for vLLM-Omni inference and serving.

- [vllm_omni.entrypoints.async_omni_llm.AsyncOmniLLM][]
- [vllm_omni.entrypoints.async_omni_llm.AsyncOmniStageLLM][]
- [vllm_omni.entrypoints.async_omni.AsyncOmni][]
- [vllm_omni.entrypoints.async_omni.AsyncOmniStageLLM][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalContentParser][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalItemTracker][]
- [vllm_omni.entrypoints.chat_utils.parse_chat_messages_futures][]
@@ -102,10 +102,6 @@ Configuration classes.

Worker classes and model runners for distributed inference.

- [vllm_omni.diffusion.worker.gpu_worker.GPUWorker][]
- [vllm_omni.diffusion.worker.gpu_worker.WorkerProc][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorker][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorkerProc][]
- [vllm_omni.worker.gpu_ar_model_runner.GPUARModelRunner][]
- [vllm_omni.worker.gpu_ar_worker.GPUARWorker][]
- [vllm_omni.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][]
4 changes: 2 additions & 2 deletions docs/configuration/stage_configs.md
@@ -44,7 +44,7 @@ stage_args:
trust_remote_code: true # Needed for Hugging Face config parsing
engine_output_type: latent # Declares that the stage outputs latent hidden states in addition to token ids
enable_prefix_caching: false # Prefix caching is not yet supported for requests with hidden-state outputs
is_comprehension: true # If the stage is a text or multimodal comprehension module. If it is, the AsyncOmniLLM will use its tokenizer as default
is_comprehension: true # Whether the stage is a text or multimodal comprehension module; if so, AsyncOmni uses its tokenizer as the default
final_output: true # Whether the stage's output is part of the final outputs. If false, the stage acts only as an intermediate stage.
final_output_type: text # What is the final output type. It can be text and audio now.
default_sampling_params: # sampling parameters for the stage. Their meaning aligns with vLLM.
@@ -206,7 +206,7 @@ Default: `false`

### `is_comprehension`

Whether this stage is a text or multimodal comprehension module. When set to `true`, the stage acts as a comprehension module that processes input text or multimodal content. If this is the first comprehension stage, `AsyncOmniLLM` will use its tokenizer as the default tokenizer for the entire pipeline.
Whether this stage is a text or multimodal comprehension module. When set to `true`, the stage acts as a comprehension module that processes input text or multimodal content. If this is the first comprehension stage, `AsyncOmni` will use its tokenizer as the default tokenizer for the entire pipeline.

Default: `true`
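As a sketch, a two-stage pipeline would typically mark only its first stage as the comprehension module. The `stage_id` field and stage layout below are illustrative, not taken from a real config; `is_comprehension`, `final_output`, and `final_output_type` are the documented fields:

```yaml
stage_args:
  - stage_id: 0            # hypothetical first stage: text/multimodal comprehension
    is_comprehension: true # AsyncOmni takes this stage's tokenizer as the pipeline default
    final_output: true
    final_output_type: text
  - stage_id: 1            # hypothetical second stage, e.g. an audio decoder
    is_comprehension: false
    final_output: true
    final_output_type: audio
```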

6 changes: 3 additions & 3 deletions docs/design/vllm_omni_design.md
@@ -14,7 +14,7 @@ vLLM-Omni is a multi-modality extension for vLLM that supports non-autoregressiv

## Key Data Flow

API Server --> OmniLLM/AsyncOmniLLM (New, including multi engines) --> LLMEngine/AsyncLLM --> Engine Core
API Server --> OmniLLM/AsyncOmni (New, including multi engines) --> LLMEngine/AsyncLLM --> Engine Core
--> Scheduler (New one for DiT) --> Executor (New one for diffusers) --> Worker (New one for DiT)
--> ModelRunner (New one for AR hidden states, New one for DiT) --> RequestState --> OutputProcessor (New one for final multimodal output)

Expand All @@ -40,7 +40,7 @@ graph TD
B --> C{Detect --omni flag}
C -->|Yes| D[Parse OmniConfig]
C -->|No| E[Forward to vLLM CLI]
D --> F[Initialize AsyncOmniLLM]
D --> F[Initialize AsyncOmni]
F --> G[Start omni Server]
G --> H[Multi-stage Processing]
E --> I[Standard vLLM Pipeline]
@@ -209,7 +209,7 @@ Similar to OmniLLM in offline inference, add some asynchronous processing, refer
from vllm.v1.engine.async_llm import AsyncLLM


class AsyncOmniLLM(AsyncLLM):
class AsyncOmni(AsyncLLM):
"""Extended AsyncLLM supporting multiple engines and stage-based processing"""

def __init__(self, stage_configs: List[StageConfig]):
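The stage-based flow this class implements can be illustrated with a toy async pipeline. All names below are illustrative stand-ins, not the real `vllm_omni` API: each stage is an async transform, and the pipeline feeds one stage's output into the next, streaming per-stage results as `AsyncOmni.generate` does:

```python
import asyncio


async def comprehension_stage(prompt: str) -> str:
    # Stand-in for the AR comprehension stage that turns a prompt
    # into tokens / hidden states.
    return f"tokens({prompt})"


async def generation_stage(tokens: str) -> str:
    # Stand-in for a downstream stage, e.g. an audio decoder.
    return f"audio({tokens})"


async def run_pipeline(prompt: str):
    # Chain the stages, yielding each stage's output as it completes.
    stage_out = prompt
    for stage in (comprehension_stage, generation_stage):
        stage_out = await stage(stage_out)
        yield stage_out


async def main() -> list[str]:
    return [out async for out in run_pipeline("hi")]


if __name__ == "__main__":
    print(asyncio.run(main()))  # ['tokens(hi)', 'audio(tokens(hi))']
```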
5 changes: 5 additions & 0 deletions docs/mkdocs/hooks/generate_api_readme.py
@@ -188,6 +188,11 @@ def scan_package(package_name: str = "vllm_omni") -> dict[str, list[str]]:
# Add classes (filter out internal ones)
for class_name in classes:
class_short_name = class_name.split(".")[-1]

# TODO: Implement worker API reference
if "diffusion.worker.gpu_worker" in class_name:
continue

# Skip if it matches internal patterns (unless it's a main model class)
if any(pattern in class_short_name for pattern in internal_patterns):
# But include main model classes
2 changes: 1 addition & 1 deletion docs/user_guide/examples/online_serving/qwen2_5_omni.md
@@ -48,7 +48,7 @@ This Web UI demo allows users to interact with the model through a web browser.

### Running Gradio Demo

Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by
Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmni by

```bash
python gradio_demo.py --model Qwen/Qwen2.5-Omni-7B --port 7861
4 changes: 2 additions & 2 deletions docs/user_guide/examples/online_serving/qwen3_omni.md
@@ -48,7 +48,7 @@ This Web UI demo allows users to interact with the model through a web browser.

### Running Gradio Demo

Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by
Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmni by

```bash
python gradio_demo.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 7861
@@ -72,7 +72,7 @@ python gradio_demo.py \
```

- `--model`: Model name
- `--use-api-server`: If set, connect to an existing vLLM HTTP API server instead of running AsyncOmniLLM locally.
- `--use-api-server`: If set, connect to an existing vLLM HTTP API server instead of running AsyncOmni locally.
- `--api-base`: Base URL for vllm serve (only used when `use-api-server` is set, default: http://localhost:8091/v1)
- `--ip`: Host/IP for Gradio server (default: 127.0.0.1)
- `--port`: Port for Gradio server (default: 7861)
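For instance, to point the demo at an already-running `vllm serve` endpoint instead of a local AsyncOmni instance (the URL and port values below are the documented defaults, shown here only as an illustration):

```bash
python gradio_demo.py \
    --model Qwen/Qwen3-Omni-30B-A3B-Instruct \
    --use-api-server \
    --api-base http://localhost:8091/v1 \
    --port 7861
```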
2 changes: 1 addition & 1 deletion examples/online_serving/qwen2_5_omni/README.md
@@ -45,7 +45,7 @@ This Web UI demo allows users to interact with the model through a web browser.

### Running Gradio Demo

Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by
Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmni by

```bash
python gradio_demo.py --model Qwen/Qwen2.5-Omni-7B --port 7861
36 changes: 18 additions & 18 deletions examples/online_serving/qwen2_5_omni/gradio_demo.py
@@ -15,7 +15,7 @@
from vllm.assets.video import video_get_metadata, video_to_ndarrays
from vllm.sampling_params import SamplingParams

from vllm_omni.entrypoints.async_omni_llm import AsyncOmniLLM
from vllm_omni.entrypoints.async_omni import AsyncOmni

# Import utils from offline inference example
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../offline_inference/qwen2_5_omni"))
@@ -236,8 +236,8 @@ def process_video_file(
return frames, metadata, audio_tuple


async def run_inference_async_omni_llm(
omni_llm: AsyncOmniLLM,
async def run_inference_async_omni(
omni: AsyncOmni,
sampling_params: list[SamplingParams],
prompt_args_template: SimpleNamespace,
user_prompt: str,
@@ -246,7 +246,7 @@ async def run_inference_async_omni_llm(
video_file: Optional[str] = None,
use_audio_in_video: bool = False,
):
"""Run inference using AsyncOmniLLM directly with multimodal support."""
"""Run inference using AsyncOmni directly with multimodal support."""
if not user_prompt.strip() and not audio_file and not image_file and not video_file:
return "Please provide at least a text prompt or multimodal input.", None

@@ -328,7 +328,7 @@ async def run_inference_async_omni_llm(
text_outputs: list[str] = []
audio_output = None

async for stage_outputs in omni_llm.generate(
async for stage_outputs in omni.generate(
prompt=omni_prompt,
request_id=request_id,
sampling_params_list=sampling_params,
@@ -362,12 +362,12 @@


def build_interface(
omni_llm: AsyncOmniLLM,
omni: AsyncOmni,
sampling_params: list[SamplingParams],
prompt_args_template: SimpleNamespace,
model: str,
):
"""Build Gradio interface for AsyncOmniLLM mode."""
"""Build Gradio interface for AsyncOmni mode."""

async def run_inference(
user_prompt: str,
@@ -376,8 +376,8 @@ async def run_inference(
video_file: Optional[str],
use_audio_in_video: bool,
):
return await run_inference_async_omni_llm(
omni_llm,
return await run_inference_async_omni(
omni,
sampling_params,
prompt_args_template,
user_prompt,
@@ -475,7 +475,7 @@ async def run_inference(

def main():
args = parse_args()
omni_llm = None
omni = None

model_name = "/".join(args.model.split("/")[-2:])
assert model_name in SUPPORTED_MODELS, (
@@ -485,31 +485,31 @@
# Register signal handlers for graceful shutdown
def signal_handler(sig, frame):
print("\nReceived interrupt signal, shutting down...")
if omni_llm is not None:
if omni is not None:
try:
omni_llm.shutdown()
omni.shutdown()
except Exception as e:
print(f"Error during shutdown: {e}")
sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

print(f"Initializing AsyncOmniLLM with model: {args.model}")
print(f"Initializing AsyncOmni with model: {args.model}")
if args.stage_configs_path:
print(f"Using custom stage configs: {args.stage_configs_path}")

sampling_params = build_sampling_params(SEED, model_name)
omni_llm = AsyncOmniLLM(
omni = AsyncOmni(
model=args.model,
stage_configs_path=args.stage_configs_path,
init_timeout=ASYNC_INIT_TIMEOUT,
)
print("✓ AsyncOmniLLM initialized successfully")
print("✓ AsyncOmni initialized successfully")
prompt_args_template = create_prompt_args(args)

demo = build_interface(
omni_llm,
omni,
sampling_params,
prompt_args_template,
args.model,
@@ -524,9 +524,9 @@ def signal_handler(sig, frame):
print("\nShutting down...")
finally:
# Cleanup
if omni_llm is not None:
if omni is not None:
try:
omni_llm.shutdown()
omni.shutdown()
except Exception as e:
print(f"Error during cleanup: {e}")

2 changes: 1 addition & 1 deletion examples/online_serving/qwen3_omni/README.md
@@ -45,7 +45,7 @@ This Web UI demo allows users to interact with the model through a web browser.

### Running Gradio Demo

Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmniLLM by
Once vllm and vllm-omni are installed, you can launch the web service built on AsyncOmni by

```bash
python gradio_demo.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 7861
36 changes: 18 additions & 18 deletions examples/online_serving/qwen3_omni/gradio_demo.py
@@ -15,7 +15,7 @@
from vllm.assets.video import video_get_metadata, video_to_ndarrays
from vllm.sampling_params import SamplingParams

from vllm_omni.entrypoints.async_omni_llm import AsyncOmniLLM
from vllm_omni.entrypoints.async_omni import AsyncOmni

# Import utils from offline inference example
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../offline_inference/qwen3_omni"))
@@ -239,8 +239,8 @@ def process_video_file(
return frames, metadata, audio_tuple


async def run_inference_async_omni_llm(
omni_llm: AsyncOmniLLM,
async def run_inference_async_omni(
omni: AsyncOmni,
sampling_params: list[SamplingParams],
prompt_args_template: SimpleNamespace,
user_prompt: str,
@@ -249,7 +249,7 @@ async def run_inference_async_omni_llm(
video_file: Optional[str] = None,
use_audio_in_video: bool = False,
):
"""Run inference using AsyncOmniLLM directly with multimodal support."""
"""Run inference using AsyncOmni directly with multimodal support."""
if not user_prompt.strip() and not audio_file and not image_file and not video_file:
return "Please provide at least a text prompt or multimodal input.", None

@@ -331,7 +331,7 @@ async def run_inference_async_omni_llm(
text_outputs: list[str] = []
audio_output = None

async for stage_outputs in omni_llm.generate(
async for stage_outputs in omni.generate(
prompt=omni_prompt,
request_id=request_id,
sampling_params_list=sampling_params,
@@ -366,12 +366,12 @@


def build_interface(
omni_llm: AsyncOmniLLM,
omni: AsyncOmni,
sampling_params: list[SamplingParams],
prompt_args_template: SimpleNamespace,
model: str,
):
"""Build Gradio interface for AsyncOmniLLM mode."""
"""Build Gradio interface for AsyncOmni mode."""

async def run_inference(
user_prompt: str,
@@ -380,8 +380,8 @@ async def run_inference(
video_file: Optional[str],
use_audio_in_video: bool,
):
return await run_inference_async_omni_llm(
omni_llm,
return await run_inference_async_omni(
omni,
sampling_params,
prompt_args_template,
user_prompt,
@@ -479,7 +479,7 @@ async def run_inference(

def main():
args = parse_args()
omni_llm = None
omni = None

model_name = "/".join(args.model.split("/")[-2:])
assert model_name in SUPPORTED_MODELS, (
@@ -489,31 +489,31 @@
# Register signal handlers for graceful shutdown
def signal_handler(sig, frame):
print("\nReceived interrupt signal, shutting down...")
if omni_llm is not None:
if omni is not None:
try:
omni_llm.shutdown()
omni.shutdown()
except Exception as e:
print(f"Error during shutdown: {e}")
sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

print(f"Initializing AsyncOmniLLM with model: {args.model}")
print(f"Initializing AsyncOmni with model: {args.model}")
if args.stage_configs_path:
print(f"Using custom stage configs: {args.stage_configs_path}")

sampling_params = build_sampling_params(SEED, model_name)
omni_llm = AsyncOmniLLM(
omni = AsyncOmni(
model=args.model,
stage_configs_path=args.stage_configs_path,
init_timeout=ASYNC_INIT_TIMEOUT,
)
print("✓ AsyncOmniLLM initialized successfully")
print("✓ AsyncOmni initialized successfully")
prompt_args_template = create_prompt_args(args)

demo = build_interface(
omni_llm,
omni,
sampling_params,
prompt_args_template,
args.model,
@@ -528,9 +528,9 @@ def signal_handler(sig, frame):
print("\nShutting down...")
finally:
# Cleanup
if omni_llm is not None:
if omni is not None:
try:
omni_llm.shutdown()
omni.shutdown()
except Exception as e:
print(f"Error during cleanup: {e}")

1 change: 1 addition & 0 deletions mkdocs.yml
@@ -86,6 +86,7 @@ plugins:
api_root_uri: "api"
nav_item_prefix: "" # No prefix in navigation tree (clean names)
show_full_namespace: false # Show only module name, not full path
on_implicit_namespace_package: skip # Skip implicit namespace packages without warning
exclude:
- "re:vllm_omni\\._.*" # Internal modules
- "vllm_omni.diffusion.models.qwen_image" # avoid importing vllm in mkdocs building