Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
b792fa7
[Model] Scaffold Index AniSora pipelines and registry (WIP)
Jan 18, 2026
e0fbd94
feat: Implement AniSora T2V and I2V pipelines with examples and tests
Jan 18, 2026
c60dc64
fix: Resolve all linting errors in AniSora pipelines
Jan 18, 2026
994c9e9
remove: Delete documentation markdown files
Jan 18, 2026
7e1bd64
docs: Add comprehensive PR validation and testing notebook for Colab
Jan 18, 2026
5f70d3a
docs: Add comprehensive PR readiness and deployment guidelines
Jan 18, 2026
b6e5c28
docs: Add deployment status summary
Jan 18, 2026
dde1d6f
docs: Add quick start guide with direct answers
Jan 18, 2026
79c552f
fix: Apply pre-commit formatting (ruff format, trailing whitespace)
Jan 18, 2026
7bc3014
Add proper exports to anisora __init__.py following vLLM-Omni convent…
Jan 19, 2026
d1655f1
feat: Remove obsolete documentation and add new scripts for AniSora p…
Jan 19, 2026
ef08f9e
Support HTTP/HTTPS image URLs in I2V and T2V scripts
Jan 19, 2026
7501f0e
feat: Support HTTP/HTTPS image URLs in I2V and T2V scripts
Jan 19, 2026
b048fce
Override model_class_name to use AniSoraImageToVideoPipeline instead …
Jan 19, 2026
1b6b641
Increase stage_init_timeout to 1200s for model download and initializ…
Jan 19, 2026
b55c193
Add detailed phase logging to track progress through generation pipeline
Jan 19, 2026
c33fe27
Fix: use init_timeout instead of stage_init_timeout parameter
Jan 19, 2026
09b44da
Remove init_timeout parameter - use default 300s
Jan 19, 2026
911a272
[Model] Add Index-AniSora I2V support
Jan 20, 2026
69e9137
feat: Add AniSora V2/V3 (14B) support with hybrid Wan loading
Jan 20, 2026
1dc3dcc
fix: Handle AniSora transformer config mismatch for V2 loading
Jan 20, 2026
465bd49
fix: Simplify transformer loading - always use base config + weights
Jan 20, 2026
d4af658
fix: Add key name conversion for AniSora->diffusers format
Jan 20, 2026
29c1d8b
fix: Complete key name conversion for AniSora V2 -> diffusers
Jan 20, 2026
422d5ea
fix: Move all components to device during initialization
Jan 20, 2026
d03b142
docs: Add AniSora V1/V2 examples to image-to-video README
Jan 20, 2026
81f0eab
chore: Remove demo media files from repo
Jan 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ th {
|`StableDiffusion3Pipeline` | Stable-Diffusion-3 | `stabilityai/stable-diffusion-3.5-medium` |
|`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` |
|`StableAudioPipeline` | Stable-Audio-Open | `stabilityai/stable-audio-open-1.0` |
|`AniSoraPipeline` | AniSora-T2V | `Index-Anisora/AniSora-v3.1-T2V` (local) |
|`AniSoraImageToVideoPipeline` | AniSora-I2V | `Index-Anisora/AniSora-v3.1-I2V` (local) |


## List of Supported Models for NPU
Expand Down
149 changes: 149 additions & 0 deletions examples/offline_inference/image_to_video/anisora_image_to_video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
AniSora Image-to-Video generation example.

Usage:
python anisora_image_to_video.py --model /path/to/anisora-diffusers \
--image input.jpg --prompt "A cat playing with yarn"
"""

import argparse
from pathlib import Path

import numpy as np
import PIL.Image
import torch

from vllm_omni.entrypoints.omni import Omni
from vllm_omni.outputs import OmniRequestOutput
from vllm_omni.utils.platform_utils import detect_device_type, is_npu


def parse_args() -> argparse.Namespace:
    """Parse and return the command-line options for the AniSora I2V example."""
    cli = argparse.ArgumentParser(description="Generate a video from an image with AniSora I2V.")
    # Required inputs.
    cli.add_argument("--model", required=True, help="AniSora Diffusers I2V model ID or local path.")
    cli.add_argument("--image", required=True, help="Path to input image.")
    # Prompting.
    cli.add_argument("--prompt", default="", help="Text prompt describing the desired motion.")
    cli.add_argument("--negative_prompt", default="", help="Negative prompt.")
    # Sampling controls.
    cli.add_argument("--seed", type=int, default=42, help="Random seed.")
    cli.add_argument("--guidance_scale", type=float, default=5.0, help="CFG scale.")
    cli.add_argument("--height", type=int, default=None, help="Video height (auto-calculated if not set).")
    cli.add_argument("--width", type=int, default=None, help="Video width (auto-calculated if not set).")
    cli.add_argument("--num_frames", type=int, default=81, help="Number of frames.")
    cli.add_argument("--num_inference_steps", type=int, default=50, help="Sampling steps.")
    cli.add_argument("--flow_shift", type=float, default=5.0, help="Scheduler flow_shift.")
    # Output options.
    cli.add_argument("--output", type=str, default="anisora_i2v.mp4", help="Path to save the video (mp4).")
    cli.add_argument("--fps", type=int, default=16, help="Frames per second for the output video.")
    return cli.parse_args()


def calculate_dimensions(
    image: "PIL.Image.Image", max_area: int = 480 * 832, mod_value: int = 16
) -> tuple[int, int]:
    """Compute an aligned (height, width) that preserves *image*'s aspect ratio.

    The returned dimensions target roughly ``max_area`` total pixels and are
    each rounded down to a multiple of ``mod_value`` (alignment required by
    the video VAE / patchified transformer).

    Args:
        image: Input image; only its ``height`` and ``width`` attributes are read.
        max_area: Approximate upper bound on ``height * width`` before rounding.
        mod_value: Alignment granularity for both dimensions (default 16).

    Returns:
        ``(height, width)`` as integers, each a multiple of ``mod_value``.

    Raises:
        ValueError: If the image has a non-positive dimension, or if
            ``max_area`` / ``mod_value`` are not positive.
    """
    # Guard degenerate inputs: the original code would raise an opaque
    # ZeroDivisionError for a zero-width image.
    if image.width <= 0 or image.height <= 0:
        raise ValueError("image must have positive width and height")
    if max_area <= 0 or mod_value <= 0:
        raise ValueError("max_area and mod_value must be positive")

    aspect_ratio = image.height / image.width
    # Solve h*w == max_area with h/w == aspect_ratio, then snap down to the
    # nearest multiple of mod_value.
    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value

    return height, width


def main() -> None:
    """Run the AniSora image-to-video example end to end.

    Parses CLI arguments, loads and resizes the input image, generates
    frames with the AniSora I2V pipeline via ``Omni``, unwraps the frames
    from the returned ``OmniRequestOutput`` objects, and writes an mp4.
    """
    args = parse_args()
    device = detect_device_type()
    # Device-local generator seeded from --seed so runs are reproducible.
    generator = torch.Generator(device=device).manual_seed(args.seed)

    # Load input image
    image = PIL.Image.open(args.image).convert("RGB")

    # Calculate dimensions if not provided
    height = args.height
    width = args.width
    if height is None or width is None:
        calc_height, calc_width = calculate_dimensions(image, max_area=480 * 832)
        # `or` keeps an explicitly passed dimension and fills in only the
        # missing one from the aspect-ratio calculation.
        height = height or calc_height
        width = width or calc_width

    # Resize image to target dimensions
    image = image.resize((width, height), PIL.Image.Resampling.LANCZOS)

    # Enable VAE memory optimizations on NPU
    vae_use_slicing = is_npu()
    vae_use_tiling = is_npu()

    omni = Omni(
        model=args.model,
        vae_use_slicing=vae_use_slicing,
        vae_use_tiling=vae_use_tiling,
        flow_shift=args.flow_shift,
    )

    frames = omni.generate(
        args.prompt,
        negative_prompt=args.negative_prompt,
        pil_image=image,
        height=height,
        width=width,
        generator=generator,
        guidance_scale=args.guidance_scale,
        num_inference_steps=args.num_inference_steps,
        num_frames=args.num_frames,
    )

    # Extract video frames from OmniRequestOutput
    if isinstance(frames, list) and len(frames) > 0:
        first_item = frames[0]

        if hasattr(first_item, "final_output_type"):
            # Video outputs are reported with type "image" (a sequence of
            # image frames); any other type means we got the wrong modality.
            if first_item.final_output_type != "image":
                raise ValueError(
                    f"Unexpected output type '{first_item.final_output_type}', expected 'image' for video generation."
                )

        # Pipeline mode: frames are nested one level down inside request_output.
        if hasattr(first_item, "is_pipeline_output") and first_item.is_pipeline_output:
            if isinstance(first_item.request_output, list) and len(first_item.request_output) > 0:
                inner_output = first_item.request_output[0]
                if isinstance(inner_output, OmniRequestOutput) and hasattr(inner_output, "images"):
                    # NOTE(review): this branch takes images[0] while the
                    # diffusion-mode branch below keeps the whole images list —
                    # confirm both shapes are accepted by export_to_video.
                    frames = inner_output.images[0] if inner_output.images else None
                    if frames is None:
                        raise ValueError("No video frames found in output.")
        # Diffusion mode: frames are directly on the images field.
        elif hasattr(first_item, "images") and first_item.images:
            frames = first_item.images
        else:
            raise ValueError("No video frames found in OmniRequestOutput.")

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # diffusers is imported lazily so argument parsing / earlier failures do
    # not require it to be installed.
    try:
        from diffusers.utils import export_to_video
    except ImportError:
        raise ImportError("diffusers is required for export_to_video.")

    # export_to_video expects a list of frames; tensors are converted to
    # channels-last numpy arrays first.
    if isinstance(frames, torch.Tensor):
        video_tensor = frames.detach().cpu()
        if video_tensor.dim() == 5:
            # Batched: [B, C, F, H, W] (channel dim 3/4) vs [B, F, H, W, C].
            if video_tensor.shape[1] in (3, 4):
                video_tensor = video_tensor[0].permute(1, 2, 3, 0)
            else:
                video_tensor = video_tensor[0]
        elif video_tensor.dim() == 4 and video_tensor.shape[0] in (3, 4):
            # Unbatched channels-first [C, F, H, W] -> [F, H, W, C].
            video_tensor = video_tensor.permute(1, 2, 3, 0)
        if video_tensor.is_floating_point():
            # Floats are assumed to be in [-1, 1]; rescale to [0, 1].
            video_tensor = video_tensor.clamp(-1, 1) * 0.5 + 0.5
        video_array = video_tensor.float().numpy()
    else:
        video_array = frames
        if hasattr(video_array, "shape") and video_array.ndim == 5:
            # Drop the batch dimension from a 5-D array.
            video_array = video_array[0]

    # Convert a 4-D (frames, H, W, C) array into the list of frames that
    # export_to_video expects.
    if isinstance(video_array, np.ndarray) and video_array.ndim == 4:
        video_array = list(video_array)

    export_to_video(video_array, str(output_path), fps=args.fps)
    print(f"Saved generated video to {output_path}")


# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
131 changes: 131 additions & 0 deletions examples/offline_inference/text_to_video/anisora_text_to_video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
from pathlib import Path

import numpy as np
import torch

from vllm_omni.entrypoints.omni import Omni
from vllm_omni.outputs import OmniRequestOutput
from vllm_omni.utils.platform_utils import detect_device_type, is_npu


def parse_args() -> argparse.Namespace:
    """Parse and return the command-line options for the AniSora T2V example."""
    cli = argparse.ArgumentParser(description="Generate a video with AniSora T2V.")
    # Required inputs.
    cli.add_argument("--model", required=True, help="AniSora Diffusers model ID or local path.")
    cli.add_argument("--prompt", required=True, help="Text prompt.")
    cli.add_argument("--negative_prompt", default="", help="Negative prompt.")
    # Sampling controls.
    cli.add_argument("--seed", type=int, default=42, help="Random seed.")
    cli.add_argument("--guidance_scale", type=float, default=4.0, help="CFG scale (applied to low/high).")
    cli.add_argument("--guidance_scale_high", type=float, default=None, help="Optional separate CFG for high-noise.")
    cli.add_argument("--height", type=int, default=720, help="Video height.")
    cli.add_argument("--width", type=int, default=1280, help="Video width.")
    cli.add_argument("--num_frames", type=int, default=81, help="Number of frames.")
    cli.add_argument("--num_inference_steps", type=int, default=40, help="Sampling steps.")
    cli.add_argument("--boundary_ratio", type=float, default=0.875, help="Boundary split ratio for low/high DiT.")
    cli.add_argument("--flow_shift", type=float, default=5.0, help="Scheduler flow_shift (5.0 for 720p, 12.0 for 480p).")
    # Output options.
    cli.add_argument("--output", type=str, default="anisora_t2v.mp4", help="Path to save the video (mp4).")
    cli.add_argument("--fps", type=int, default=24, help="Frames per second for the output video.")
    return cli.parse_args()


def main() -> None:
    """Run the AniSora text-to-video example end to end.

    Parses CLI arguments, generates frames with the AniSora T2V pipeline
    via ``Omni`` (dual low/high-noise DiT controlled by --boundary_ratio),
    unwraps the frames from the returned ``OmniRequestOutput`` objects, and
    writes an mp4.
    """
    args = parse_args()
    device = detect_device_type()
    # Device-local generator seeded from --seed so runs are reproducible.
    generator = torch.Generator(device=device).manual_seed(args.seed)

    # Enable VAE memory optimizations on NPU
    vae_use_slicing = is_npu()
    vae_use_tiling = is_npu()

    omni = Omni(
        model=args.model,
        vae_use_slicing=vae_use_slicing,
        vae_use_tiling=vae_use_tiling,
        boundary_ratio=args.boundary_ratio,
        flow_shift=args.flow_shift,
    )

    frames = omni.generate(
        args.prompt,
        negative_prompt=args.negative_prompt,
        height=args.height,
        width=args.width,
        generator=generator,
        guidance_scale=args.guidance_scale,
        # Separate CFG for the high-noise stage; None falls back to
        # pipeline defaults.
        guidance_scale_2=args.guidance_scale_high,
        num_inference_steps=args.num_inference_steps,
        num_frames=args.num_frames,
    )

    # Extract video frames from OmniRequestOutput
    if isinstance(frames, list) and len(frames) > 0:
        first_item = frames[0]

        # Check if it's an OmniRequestOutput
        if hasattr(first_item, "final_output_type"):
            # Video outputs are reported with type "image" (a sequence of
            # image frames); any other type means we got the wrong modality.
            if first_item.final_output_type != "image":
                raise ValueError(
                    f"Unexpected output type '{first_item.final_output_type}', expected 'image' for video generation."
                )

        # Pipeline mode: extract from nested request_output
        if hasattr(first_item, "is_pipeline_output") and first_item.is_pipeline_output:
            if isinstance(first_item.request_output, list) and len(first_item.request_output) > 0:
                inner_output = first_item.request_output[0]
                if isinstance(inner_output, OmniRequestOutput) and hasattr(inner_output, "images"):
                    # NOTE(review): this branch takes images[0] while the
                    # diffusion-mode branch below keeps the whole images list —
                    # confirm both shapes are accepted by export_to_video.
                    frames = inner_output.images[0] if inner_output.images else None
                    if frames is None:
                        raise ValueError("No video frames found in output.")
        # Diffusion mode: use direct images field
        elif hasattr(first_item, "images") and first_item.images:
            frames = first_item.images
        else:
            raise ValueError("No video frames found in OmniRequestOutput.")

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # diffusers is imported lazily so argument parsing / earlier failures do
    # not require it to be installed.
    try:
        from diffusers.utils import export_to_video
    except ImportError:
        raise ImportError("diffusers is required for export_to_video.")

    # frames may be np.ndarray (preferred) or torch.Tensor
    # export_to_video expects a list of frames with values in [0, 1]
    if isinstance(frames, torch.Tensor):
        video_tensor = frames.detach().cpu()
        if video_tensor.dim() == 5:
            # [B, C, F, H, W] or [B, F, H, W, C]
            if video_tensor.shape[1] in (3, 4):
                video_tensor = video_tensor[0].permute(1, 2, 3, 0)
            else:
                video_tensor = video_tensor[0]
        elif video_tensor.dim() == 4 and video_tensor.shape[0] in (3, 4):
            # Unbatched channels-first [C, F, H, W] -> [F, H, W, C].
            video_tensor = video_tensor.permute(1, 2, 3, 0)
        # If float, assume [-1,1] and normalize to [0,1]
        if video_tensor.is_floating_point():
            video_tensor = video_tensor.clamp(-1, 1) * 0.5 + 0.5
        video_array = video_tensor.float().numpy()
    else:
        video_array = frames
        if hasattr(video_array, "shape") and video_array.ndim == 5:
            # Drop the batch dimension from a 5-D array.
            video_array = video_array[0]

    # Convert 4D array (frames, H, W, C) to list of frames for export_to_video
    if isinstance(video_array, np.ndarray) and video_array.ndim == 4:
        video_array = list(video_array)

    export_to_video(video_array, str(output_path), fps=args.fps)
    print(f"Saved generated video to {output_path}")


# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
19 changes: 19 additions & 0 deletions tests/diffusion/models/test_anisora_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm_omni.diffusion.registry import (
PIPELINE_REGISTRY,
DIFFUSION_PRE_PROCESS_MAP,
DIFFUSION_POST_PROCESS_MAP,
)


def test_anisora_registry_entries_present():
    """Both AniSora pipeline classes are wired into every diffusion registry map."""
    pipeline_names = ("AniSoraPipeline", "AniSoraImageToVideoPipeline")
    registries = (
        PIPELINE_REGISTRY,
        DIFFUSION_PRE_PROCESS_MAP,
        DIFFUSION_POST_PROCESS_MAP,
    )
    for name in pipeline_names:
        for registry in registries:
            assert name in registry
2 changes: 2 additions & 0 deletions vllm_omni/diffusion/models/anisora/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
Loading