Merge branch 'main' into add-bagel-example-scripts

nussejzz · web-flow · commit 9de79f89abcf · 2026-01-30T00:27:16.000+08:00
diff --git a/vllm_omni/diffusion/offload.py b/vllm_omni/diffusion/offload.py
@@ -18,6 +18,8 @@
 from torch import nn
 from vllm.logger import init_logger
 
+from vllm_omni.platforms import current_omni_platform
+
 if TYPE_CHECKING:
     from vllm_omni.diffusion.data import OmniDiffusionConfig
 
@@ -63,8 +65,8 @@ def _to_cpu(self, module: nn.Module) -> None:
         module.to("cpu", non_blocking=True)
 
         # Release allocator blocks when tensors leave the GPU.
-        if previous_device.type == "cuda" and torch.cuda.is_available():
-            torch.cuda.empty_cache()
+        if previous_device.type != "cpu":
+            current_omni_platform.empty_cache()
 
         if self.pin_memory:
             for p in module.parameters():
@@ -87,15 +89,19 @@ def _dit_pre_hook(self, module: nn.Module, args: tuple) -> None:
         for enc in self.encoders:
             self._to_cpu(enc)
         self._to_gpu(module)
-        torch.cuda.synchronize()
+
+        current_omni_platform.synchronize()
+
         logger.debug("Swapped: encoders -> CPU, DiT -> GPU")
 
     def _encoder_pre_hook(self, module: nn.Module, args: tuple) -> None:
         """Before encoder forward: offload DiT, load encoder."""
         for dit_mod in self.dits:
             self._to_cpu(dit_mod)
         self._to_gpu(module)
-        torch.cuda.synchronize()
+
+        current_omni_platform.synchronize()
+
         logger.debug("Swapped: DiT -> CPU, encoder -> GPU")
 
     def register(self) -> None:
@@ -166,7 +172,10 @@ def apply_offload_hooks(
         try:
             device = next(dit_modules[0].parameters()).device
         except StopIteration:
-            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            try:
+                device = current_omni_platform.get_torch_device()
+            except (NotImplementedError, AttributeError):
+                device = torch.device("cpu")
 
     # Collect all encoders
     encoders: list[nn.Module] = []
@@ -184,9 +193,10 @@ def apply_offload_hooks(
     pin = getattr(od_config, "pin_cpu_memory", True)
     for dit_mod in dit_modules:
         dit_mod.to("cpu")
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    if pin and torch.cuda.is_available():
+
+    current_omni_platform.empty_cache()
+
+    if pin:
         for dit_mod in dit_modules:
             for p in dit_mod.parameters():
                 if p.data.device.type == "cpu" and not p.data.is_pinned():
diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_token2wav.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_token2wav.py
@@ -736,6 +736,10 @@ def kaiser_sinc_filter1d(cutoff: float, half_width: float, kernel_size: int) ->
         kaiser_window = torch.kaiser_window(
             kernel_size, beta=beta, periodic=False, dtype=torch.float32, device="cpu"
         ).to("npu")
+    elif current_omni_platform.is_xpu():
+        kaiser_window = torch.kaiser_window(
+            kernel_size, beta=beta, periodic=False, dtype=torch.float32, device="cpu"
+        ).to("xpu")
     else:
         kaiser_window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
 
diff --git a/vllm_omni/platforms/interface.py b/vllm_omni/platforms/interface.py
@@ -31,6 +31,15 @@ class OmniPlatform(Platform):
     def is_npu(self) -> bool:
         return self._omni_enum == OmniPlatformEnum.NPU
 
+    def is_xpu(self) -> bool:
+        return self._omni_enum == OmniPlatformEnum.XPU
+
+    def is_cuda(self) -> bool:
+        return self._omni_enum == OmniPlatformEnum.CUDA
+
+    def is_rocm(self) -> bool:
+        return self._omni_enum == OmniPlatformEnum.ROCM
+
     @classmethod
     def get_omni_ar_worker_cls(cls) -> str:
         raise NotImplementedError
diff --git a/vllm_omni/platforms/xpu/stage_configs/qwen2_5_omni.yaml b/vllm_omni/platforms/xpu/stage_configs/qwen2_5_omni.yaml
@@ -0,0 +1,101 @@
+# Stage config for running Qwen2.5-Omni with the OmniLLM architecture.
+
+# The following config has been verified on 2x 1550-64G XPUs.
+stage_args:
+  - stage_id: 0
+    stage_type: llm # Use llm stage type to launch OmniLLM
+    runtime:
+      process: true # Run this stage in a separate process
+      devices: "0" # Visible devices for this stage
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.8
+      enforce_eager: false
+      trust_remote_code: true
+      engine_output_type: latent
+      enable_prefix_caching: false
+    is_comprehension: true
+    final_output: true
+    final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+  - stage_id: 1
+    stage_type: llm # Use llm stage type to launch OmniLLM
+    runtime:
+      process: true
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.8
+      enforce_eager: false
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: latent
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
+    default_sampling_params:
+      temperature: 0.9
+      top_p: 0.8
+      top_k: 40
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+      stop_token_ids: [8294]
+
+  - stage_id: 2
+    stage_type: llm # Use llm stage type to launch OmniLLM
+    runtime:
+      process: true
+      devices: "0"
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_type: generation
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      gpu_memory_utilization: 0.15
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: audio
+    engine_input_source: [1]
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+
+# Top-level runtime config (concise): default windows and stage edges
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1 # Simplified: trigger downstream only after full upstream completion
+    max_inflight: 1 # Simplified: process serially within each stage
+
+  edges:
+    - from: 0 # thinker → talker: trigger only after receiving full input (-1)
+      to: 1
+      window_size: -1
+    - from: 1 # talker → code2wav: trigger only after receiving full input (-1)
+      to: 2
+      window_size: -1
diff --git a/vllm_omni/platforms/xpu/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/platforms/xpu/stage_configs/qwen3_omni_moe.yaml
@@ -0,0 +1,99 @@
+# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
+# Stage 0: Thinker (multimodal understanding + text generation)
+# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes)
+# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
+
+# The following config has been verified on 4x 1550-64G XPUs.
+stage_args:
+  - stage_id: 0
+    stage_type: llm # Use llm stage type to launch OmniLLM
+    runtime:
+      devices: "0,1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.8
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: latent # Output hidden states for talker
+      distributed_executor_backend: "mp"
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      hf_config_name: thinker_config
+      tensor_parallel_size: 2
+    final_output: true
+    final_output_type: text
+    is_comprehension: true
+    default_sampling_params:
+      temperature: 0.4
+      top_p: 0.9
+      top_k: 1
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+
+  - stage_id: 1
+    stage_type: llm # Use llm stage type to launch OmniLLM
+    runtime:
+      devices: "2"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.3
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: latent # Output codec codes for code2wav
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      distributed_executor_backend: "mp"
+      hf_config_name: talker_config
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
+    # final_output: true
+    # final_output_type: text
+    default_sampling_params:
+      temperature: 0.9
+      top_k: 50
+      max_tokens: 4096
+      seed: 42
+      detokenize: False
+      repetition_penalty: 1.05
+      stop_token_ids: [2150]
+
+  - stage_id: 2
+    stage_type: llm # Use llm stage type to launch OmniLLM
+    runtime:
+      devices: "3"
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_type: generation
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: audio # Final output: audio waveform
+      gpu_memory_utilization: 0.1
+      distributed_executor_backend: "mp"
+      max_num_batched_tokens: 1000000
+      hf_config_name: thinker_config
+    engine_input_source: [1]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 65536
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
diff --git a/vllm_omni/platforms/xpu/utils.py b/vllm_omni/platforms/xpu/utils.py
@@ -0,0 +1,16 @@
+from contextlib import contextmanager
+
+import torch
+
+
+@contextmanager
+def torch_cuda_wrapper():
+    try:
+        # Replace the torch.cuda stream APIs with their torch.xpu equivalents; this should work by default.
+        torch.cuda.Stream = torch.xpu.Stream
+        torch.cuda.default_stream = torch.xpu.current_stream
+        torch.cuda.current_stream = torch.xpu.current_stream
+        torch.cuda.stream = torch.xpu.stream
+        yield
+    finally:
+        pass
diff --git a/vllm_omni/platforms/xpu/worker/__init__.py b/vllm_omni/platforms/xpu/worker/__init__.py
diff --git a/vllm_omni/platforms/xpu/worker/xpu_ar_model_runner.py b/vllm_omni/platforms/xpu/worker/xpu_ar_model_runner.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm_omni.platforms.xpu.utils import torch_cuda_wrapper
+from vllm_omni.worker.gpu_ar_model_runner import GPUARModelRunner
+
+
+class XPUARModelRunner(GPUARModelRunner):
+    def __init__(self, *args, **kwargs):
+        with torch_cuda_wrapper():
+            super().__init__(*args, **kwargs)
+
+    def _init_device_properties(self):
+        self.num_sms = None
+
+    def _sync_device(self) -> None:
+        torch.xpu.synchronize()
diff --git a/vllm_omni/platforms/xpu/worker/xpu_ar_worker.py b/vllm_omni/platforms/xpu/worker/xpu_ar_worker.py
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.v1.worker.xpu_worker import XPUWorker
+
+from vllm_omni.platforms.xpu.worker.xpu_ar_model_runner import XPUARModelRunner
+from vllm_omni.worker.mixins import OmniWorkerMixin
+
+
+class XPUARWorker(OmniWorkerMixin, XPUWorker):
+    """XPU AR worker for the thinker/talker stages in the Omni model."""
+
+    def init_device(self):
+        super().init_device()
+        self.model_runner: XPUARModelRunner = XPUARModelRunner(self.vllm_config, self.device)
diff --git a/vllm_omni/platforms/xpu/worker/xpu_generation_model_runner.py b/vllm_omni/platforms/xpu/worker/xpu_generation_model_runner.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm_omni.platforms.xpu.utils import torch_cuda_wrapper
+from vllm_omni.worker.gpu_generation_model_runner import GPUGenerationModelRunner
+
+
+class XPUGenerationModelRunner(GPUGenerationModelRunner):
+    def __init__(self, *args, **kwargs):
+        with torch_cuda_wrapper():
+            super().__init__(*args, **kwargs)
+
+    def _init_device_properties(self):
+        self.num_sms = None
+
+    def _sync_device(self) -> None:
+        torch.xpu.synchronize()
diff --git a/vllm_omni/platforms/xpu/worker/xpu_generation_worker.py b/vllm_omni/platforms/xpu/worker/xpu_generation_worker.py
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.v1.worker.xpu_worker import XPUWorker
+
+from vllm_omni.platforms.xpu.worker.xpu_generation_model_runner import XPUGenerationModelRunner
+from vllm_omni.worker.mixins import OmniWorkerMixin
+
+
+class XPUGenerationWorker(OmniWorkerMixin, XPUWorker):
+    """XPU generation worker for the code2wav (non-AR waveform generation) stage in the Omni model."""
+
+    def init_device(self):
+        super().init_device()
+        self.model_runner: XPUGenerationModelRunner = XPUGenerationModelRunner(self.vllm_config, self.device)