Status: Closed (27 commits)

Commits:
b792fa7 [Model] Scaffold Index AniSora pipelines and registry (WIP) (Jan 18, 2026)
e0fbd94 feat: Implement AniSora T2V and I2V pipelines with examples and tests (Jan 18, 2026)
c60dc64 fix: Resolve all linting errors in AniSora pipelines (Jan 18, 2026)
994c9e9 remove: Delete documentation markdown files (Jan 18, 2026)
7e1bd64 docs: Add comprehensive PR validation and testing notebook for Colab (Jan 18, 2026)
5f70d3a docs: Add comprehensive PR readiness and deployment guidelines (Jan 18, 2026)
b6e5c28 docs: Add deployment status summary (Jan 18, 2026)
dde1d6f docs: Add quick start guide with direct answers (Jan 18, 2026)
79c552f fix: Apply pre-commit formatting (ruff format, trailing whitespace) (Jan 18, 2026)
7bc3014 Add proper exports to anisora __init__.py following vLLM-Omni convent… (Jan 19, 2026)
d1655f1 feat: Remove obsolete documentation and add new scripts for AniSora p… (Jan 19, 2026)
ef08f9e Support HTTP/HTTPS image URLs in I2V and T2V scripts (Jan 19, 2026)
7501f0e feat: Support HTTP/HTTPS image URLs in I2V and T2V scripts (Jan 19, 2026)
b048fce Override model_class_name to use AniSoraImageToVideoPipeline instead … (Jan 19, 2026)
1b6b641 Increase stage_init_timeout to 1200s for model download and initializ… (Jan 19, 2026)
b55c193 Add detailed phase logging to track progress through generation pipeline (Jan 19, 2026)
c33fe27 Fix: use init_timeout instead of stage_init_timeout parameter (Jan 19, 2026)
09b44da Remove init_timeout parameter - use default 300s (Jan 19, 2026)
911a272 [Model] Add Index-AniSora I2V support (Jan 20, 2026)
69e9137 feat: Add AniSora V2/V3 (14B) support with hybrid Wan loading (Jan 20, 2026)
1dc3dcc fix: Handle AniSora transformer config mismatch for V2 loading (Jan 20, 2026)
465bd49 fix: Simplify transformer loading - always use base config + weights (Jan 20, 2026)
d4af658 fix: Add key name conversion for AniSora->diffusers format (Jan 20, 2026)
29c1d8b fix: Complete key name conversion for AniSora V2 -> diffusers (Jan 20, 2026)
422d5ea fix: Move all components to device during initialization (Jan 20, 2026)
d03b142 docs: Add AniSora V1/V2 examples to image-to-video README (Jan 20, 2026)
81f0eab chore: Remove demo media files from repo (Jan 20, 2026)
docs/models/supported_models.md (1 addition, 0 deletions)

@@ -34,6 +34,7 @@ th {
|`StableDiffusion3Pipeline` | Stable-Diffusion-3 | `stabilityai/stable-diffusion-3.5-medium` |
|`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` |
|`StableAudioPipeline` | Stable-Audio-Open | `stabilityai/stable-audio-open-1.0` |
|`AniSoraI2VCogVideoXPipeline` | AniSora-I2V | `Disty0/Index-anisora-5B-diffusers` |


## List of Supported Models for NPU
examples/offline_inference/image_to_video/README.md (58 additions, 2 deletions)

@@ -1,8 +1,64 @@
# Image-To-Video

-This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models with vLLM-Omni's offline inference API.
+This example demonstrates how to generate videos from images using vLLM-Omni's offline inference API.

-## Local CLI Usage
+## Supported Models

- **Wan2.2-I2V-A14B-Diffusers** (MoE) - Alibaba's Wan2.2 14B MoE model
- **Wan2.2-TI2V-5B-Diffusers** (Unified) - Alibaba's unified T2V+I2V 5B model
- **AniSora V1 (5B)** - CogVideoX-based anime video generation
- **AniSora V2/V3 (14B)** - Wan2.1-based anime video generation

---

## AniSora V1 (5B) - CogVideoX-based

Optimized for anime-style video generation using the CogVideoX architecture.

```bash
python anisora_image_to_video.py \
--model IndexTeam/AniSora-v1-i2v-diffusers \
--image input.png \
--prompt "anime girl walking, flowing hair, studio ghibli style" \
--height 480 \
--width 720 \
--num_frames 49 \
--guidance_scale 5.0 \
--num_inference_steps 50 \
--fps 16 \
--output anisora_v1.mp4
```

**Requirements:** ~24GB VRAM
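If `--height`/`--width` are omitted, the script derives them from the input image, snapping both sides to multiples of 16 while keeping the pixel area at or below 480×832. The script's calculation can be sketched standalone (this variant takes integer dimensions instead of a PIL image, purely for illustration):

```python
import numpy as np


def calculate_dimensions(img_height: int, img_width: int, max_area: int = 480 * 832) -> tuple[int, int]:
    # Preserve the aspect ratio, keep height * width <= max_area,
    # and round both sides down to multiples of 16.
    aspect_ratio = img_height / img_width
    mod_value = 16
    height = int(round(np.sqrt(max_area * aspect_ratio))) // mod_value * mod_value
    width = int(round(np.sqrt(max_area / aspect_ratio))) // mod_value * mod_value
    return height, width


print(calculate_dimensions(1080, 1920))  # (464, 832)
```

For a 16:9 source this yields 464×832 (386,048 pixels), just under the 399,360-pixel budget.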

---

## AniSora V2/V3 (14B) - Wan2.1-based

High-quality anime video generation using the Wan2.1 architecture with community weights.

```bash
python anisora_v2_image_to_video.py \
--image input.png \
--prompt "anime scene, high quality animation, smooth motion" \
--height 480 \
--width 832 \
--num-frames 49 \
--guidance-scale 5.0 \
--num-inference-steps 30 \
--fps 8 \
--output anisora_v2.mp4
```

**Requirements:** ~65GB VRAM for the 14B model in bfloat16

**Supported transformer models:**
- `aardsoul-music/Wan2.1-Anisora-14B` (recommended)
- `ikusa/anisorav2`
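The PR also adds HTTP/HTTPS image URL support to the I2V and T2V scripts (`--image` can be a URL or a local path). The dispatch can be sketched as follows; the helper name is hypothetical, not the scripts' actual function:

```python
import urllib.request
from pathlib import Path


def read_image_bytes(src: str) -> bytes:
    # Hypothetical helper: accept either an HTTP/HTTPS URL or a local
    # file path; the scripts' internal implementation may differ.
    if src.startswith(("http://", "https://")):
        with urllib.request.urlopen(src) as resp:
            return resp.read()
    return Path(src).read_bytes()
```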

---

## Wan2.2 Models

### Wan2.2-I2V-A14B-Diffusers (MoE)
```bash
# …
```
examples/offline_inference/image_to_video/anisora_image_to_video.py (164 additions, 0 deletions)

@@ -0,0 +1,164 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
AniSora Image-to-Video generation example.

Usage:
    python anisora_image_to_video.py --model /path/to/anisora-diffusers \
        --image input.jpg --prompt "A cat playing with yarn"
"""

import argparse
from pathlib import Path

import numpy as np
import PIL.Image
import torch

from vllm_omni.entrypoints.omni import Omni
from vllm_omni.outputs import OmniRequestOutput
from vllm_omni.utils.platform_utils import detect_device_type, is_npu


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Generate a video from an image with AniSora I2V.")
    parser.add_argument("--model", required=True, help="AniSora Diffusers I2V model ID or local path.")
    parser.add_argument("--image", required=True, help="Path to input image.")
    parser.add_argument("--prompt", default="", help="Text prompt describing the desired motion.")
    parser.add_argument("--negative_prompt", default="", help="Negative prompt.")
    parser.add_argument("--seed", type=int, default=42, help="Random seed.")
    parser.add_argument("--guidance_scale", type=float, default=5.0, help="CFG scale.")
    parser.add_argument(
        "--height",
        type=int,
        default=None,
        help="Video height (auto-calculated if not set).",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=None,
        help="Video width (auto-calculated if not set).",
    )
    parser.add_argument("--num_frames", type=int, default=81, help="Number of frames.")
    parser.add_argument("--num_inference_steps", type=int, default=50, help="Sampling steps.")
    parser.add_argument("--flow_shift", type=float, default=5.0, help="Scheduler flow_shift.")
    parser.add_argument(
        "--output",
        type=str,
        default="anisora_i2v.mp4",
        help="Path to save the video (mp4).",
    )
    parser.add_argument("--fps", type=int, default=16, help="Frames per second for the output video.")
    return parser.parse_args()


def calculate_dimensions(image: PIL.Image.Image, max_area: int = 480 * 832) -> tuple[int, int]:
    """Pick output dimensions that preserve the aspect ratio, keep
    height * width at or below max_area, and are multiples of 16."""
    aspect_ratio = image.height / image.width
    mod_value = 16

    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value

    return height, width


def main():
    args = parse_args()
    device = detect_device_type()
    generator = torch.Generator(device=device).manual_seed(args.seed)

    # Load input image
    image = PIL.Image.open(args.image).convert("RGB")

    # Calculate dimensions if not provided
    height = args.height
    width = args.width
    if height is None or width is None:
        calc_height, calc_width = calculate_dimensions(image, max_area=480 * 832)
        height = height or calc_height
        width = width or calc_width

    # Resize image to target dimensions
    image = image.resize((width, height), PIL.Image.Resampling.LANCZOS)

    # Enable VAE memory optimizations on NPU
    vae_use_slicing = is_npu()
    vae_use_tiling = is_npu()

    omni = Omni(
        model=args.model,
        vae_use_slicing=vae_use_slicing,
        vae_use_tiling=vae_use_tiling,
        flow_shift=args.flow_shift,
    )

    frames = omni.generate(
        args.prompt,
        negative_prompt=args.negative_prompt,
        pil_image=image,
        height=height,
        width=width,
        generator=generator,
        guidance_scale=args.guidance_scale,
        num_inference_steps=args.num_inference_steps,
        num_frames=args.num_frames,
    )

    # Extract video frames from OmniRequestOutput
    if isinstance(frames, list) and len(frames) > 0:
        first_item = frames[0]

        if hasattr(first_item, "final_output_type") and first_item.final_output_type != "image":
            raise ValueError(
                f"Unexpected output type '{first_item.final_output_type}', expected 'image' for video generation."
            )

        if hasattr(first_item, "is_pipeline_output") and first_item.is_pipeline_output:
            frames = None
            if isinstance(first_item.request_output, list) and len(first_item.request_output) > 0:
                inner_output = first_item.request_output[0]
                if isinstance(inner_output, OmniRequestOutput) and getattr(inner_output, "images", None):
                    frames = inner_output.images[0]
            if frames is None:
                raise ValueError("No video frames found in pipeline output.")
        elif hasattr(first_item, "images") and first_item.images:
            frames = first_item.images
        else:
            raise ValueError("No video frames found in OmniRequestOutput.")

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        from diffusers.utils import export_to_video
    except ImportError as err:
        raise ImportError("diffusers is required for export_to_video.") from err

    if isinstance(frames, torch.Tensor):
        video_tensor = frames.detach().cpu()
        if video_tensor.dim() == 5:
            if video_tensor.shape[1] in (3, 4):
                # (B, C, F, H, W) -> (F, H, W, C)
                video_tensor = video_tensor[0].permute(1, 2, 3, 0)
            else:
                # (B, F, H, W, C) -> (F, H, W, C)
                video_tensor = video_tensor[0]
        elif video_tensor.dim() == 4 and video_tensor.shape[0] in (3, 4):
            # (C, F, H, W) -> (F, H, W, C)
            video_tensor = video_tensor.permute(1, 2, 3, 0)
        if video_tensor.is_floating_point():
            # Rescale from [-1, 1] to [0, 1] for export
            video_tensor = video_tensor.clamp(-1, 1) * 0.5 + 0.5
        video_array = video_tensor.float().numpy()
    else:
        video_array = frames
        if hasattr(video_array, "shape") and video_array.ndim == 5:
            video_array = video_array[0]

    if isinstance(video_array, np.ndarray) and video_array.ndim == 4:
        video_array = list(video_array)

    export_to_video(video_array, str(output_path), fps=args.fps)
    print(f"Saved generated video to {output_path}")


if __name__ == "__main__":
    main()
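The export-side tensor handling in the script can be exercised in isolation. A minimal numpy sketch of the (C, F, H, W) → (F, H, W, C) layout change and the [-1, 1] → [0, 1] rescale that precede `export_to_video` (the shapes are illustrative):

```python
import numpy as np


def to_export_frames(video: np.ndarray) -> list:
    """Convert a (C, F, H, W) float video in [-1, 1] into a list of
    (H, W, C) frames in [0, 1], the layout export_to_video consumes."""
    arr = np.transpose(video, (1, 2, 3, 0))      # (C, F, H, W) -> (F, H, W, C)
    arr = np.clip(arr, -1.0, 1.0) * 0.5 + 0.5    # [-1, 1] -> [0, 1]
    return list(arr)                             # list of (H, W, C) frames


video = np.random.uniform(-1.2, 1.2, size=(3, 8, 32, 32)).astype(np.float32)
frames = to_export_frames(video)
print(len(frames), frames[0].shape)  # 8 (32, 32, 3)
```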