diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 1c25a0054a..5d3219ff99 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -24,6 +24,18 @@ steps: agents: queue: "cpu_queue_premerge" + - label: "Image Generation API Test" + depends_on: image-build + commands: + - pytest -s -v tests/entrypoints/openai/ + agents: + queue: "cpu_queue_premerge" + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + - label: "Diffusion Model Test" timeout_in_minutes: 20 depends_on: image-build diff --git a/docs/.nav.yml b/docs/.nav.yml index 516c9d7129..4d077a0a3c 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -4,6 +4,9 @@ nav: - Getting Started: - getting_started/quickstart.md - getting_started/installation + - Serving: + - OpenAI-Compatible API: + - Image Generation: serving/image_generation_api.md - Examples: - examples/README.md - Offline Inference: diff --git a/docs/mkdocs/hooks/generate_api_readme.py b/docs/mkdocs/hooks/generate_api_readme.py index eb7bbfdbe6..57ddcadbfa 100644 --- a/docs/mkdocs/hooks/generate_api_readme.py +++ b/docs/mkdocs/hooks/generate_api_readme.py @@ -136,6 +136,15 @@ def scan_package(package_name: str = "vllm_omni") -> dict[str, list[str]]: relative_path = py_file.relative_to(ROOT_DIR) module_path = str(relative_path.with_suffix("")).replace("/", ".").replace("\\", ".") + # Skip excluded modules (avoid importing vllm during docs build) + excluded_prefixes = [ + "vllm_omni.diffusion.models.qwen_image", + "vllm_omni.entrypoints.async_diffusion", + "vllm_omni.entrypoints.openai", + ] + if any(module_path.startswith(prefix) for prefix in excluded_prefixes): + continue + # Handle __init__.py - use parent module path if py_file.name == "__init__.py": # Remove .__init__ from module path diff --git a/docs/serving/image_generation_api.md b/docs/serving/image_generation_api.md new file mode 100644 index 0000000000..113524efa2 --- 
/dev/null +++ b/docs/serving/image_generation_api.md @@ -0,0 +1,249 @@ +# Image Generation API + +vLLM-Omni provides an OpenAI DALL-E compatible API for text-to-image generation using diffusion models. + +Each server instance runs a single model (specified at startup via `vllm serve --omni`). + +## Quick Start + +### Start the Server + +For example: + +```bash +# Qwen-Image +vllm serve Qwen/Qwen-Image --omni --port 8000 + +# Z-Image Turbo +vllm serve Tongyi-MAI/Z-Image-Turbo --omni --port 8000 +``` + +### Generate Images + +**Using curl:** + +```bash +curl -X POST http://localhost:8000/v1/images/generations \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "a dragon laying over the spine of the Green Mountains of Vermont", + "size": "1024x1024", + "seed": 42 + }' | jq -r '.data[0].b64_json' | base64 -d > dragon.png +``` + +**Using Python:** + +```python +import requests +import base64 +from PIL import Image +import io + +response = requests.post( + "http://localhost:8000/v1/images/generations", + json={ + "prompt": "a black and white cat wearing a princess tiara", + "size": "1024x1024", + "num_inference_steps": 50, + "seed": 42, + } +) + +# Decode and save +img_data = response.json()["data"][0]["b64_json"] +img_bytes = base64.b64decode(img_data) +img = Image.open(io.BytesIO(img_bytes)) +img.save("cat.png") +``` + +**Using OpenAI SDK:** + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8000/v1", api_key="none") + +response = client.images.generate( + model="Qwen/Qwen-Image", + prompt="a horse jumping over a fence nearby a babbling brook", + n=1, + size="1024x1024", + response_format="b64_json" +) + +# Note: Extension parameters (seed, num_inference_steps, guidance_scale, ...) are not SDK arguments; pass them via extra_body={...} or use a direct HTTP request +``` + +## API Reference + +### Endpoint + +``` +POST /v1/images/generations +Content-Type: application/json +``` + +### Request Parameters + +#### OpenAI Standard Parameters + +| Parameter | Type | Default | Description | 
+|-----------|------|---------|-------------| +| `prompt` | string | **required** | Text description of the desired image | +| `model` | string | server's model | Model to use (optional; if it does not match the server's model, a warning is logged and the server's model is used) | +| `n` | integer | 1 | Number of images to generate (1-10) | +| `size` | string | model defaults | Image dimensions in WxH format (e.g., "1024x1024", "512x512") | +| `response_format` | string | "b64_json" | Response format (only "b64_json" supported) | +| `user` | string | null | User identifier for tracking | + +#### vllm-omni Extension Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `negative_prompt` | string | null | Text describing what to avoid in the image | +| `num_inference_steps` | integer | model defaults | Number of diffusion steps (1-200) | +| `guidance_scale` | float | model defaults | Classifier-free guidance scale (0.0-20.0) | +| `true_cfg_scale` | float | model defaults | True CFG scale (0.0-20.0; model-specific parameter, may be ignored if not supported) | +| `seed` | integer | null | Random seed for reproducibility | + +### Response Format + +```json +{ + "created": 1701234567, + "data": [ + { + "b64_json": "<base64-encoded PNG>", + "url": null, + "revised_prompt": null + } + ] +} +``` + +## Examples + +### Multiple Images + +```bash +curl -X POST http://localhost:8000/v1/images/generations \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "a steampunk city set in a valley of the Adirondack mountains", + "n": 4, + "size": "1024x1024", + "seed": 123 + }' +``` + +This generates 4 images in a single request. 
+ +### With Negative Prompt + +```python +response = requests.post( + "http://localhost:8000/v1/images/generations", + json={ + "prompt": "a portrait of a skier in deep powder snow", + "negative_prompt": "blurry, low quality, distorted, ugly", + "num_inference_steps": 100, + "size": "1024x1024", + } +) +``` + +## Parameter Handling + +The API passes parameters directly to the diffusion pipeline without model-specific transformation: + +- **Default values**: When parameters are not specified, the underlying model uses its own defaults +- **Pass-through design**: User-provided values are forwarded directly to the diffusion engine +- **Minimal validation**: Only basic type checking and range validation at the API level + +### Parameter Compatibility + +Because parameters are forwarded as-is, the API does not check whether the loaded model supports them: + +- Unsupported parameters may be silently ignored by the model +- Incompatible values will result in errors from the underlying pipeline +- Recommended values vary by model - consult model documentation + +**Best Practice:** Start with the model's recommended parameters, then adjust based on your needs. + +## Error Responses + +### 400 Bad Request + +Invalid parameters (e.g., a malformed `size` value): + +```json +{ + "detail": "Invalid size format: '1024x'. Width and height must be integers." +} +``` + +### 422 Unprocessable Entity + +Validation errors (e.g., missing required fields, out-of-range values, or an unsupported `response_format`): + +```json +{ + "detail": [ + { + "loc": ["body", "prompt"], + "msg": "Field required", + "type": "missing" + } + ] +} +``` + +### 503 Service Unavailable + +Diffusion engine not initialized: + +```json +{ + "detail": "Diffusion engine not initialized. Start server with a diffusion model." 
+} +``` + +## Troubleshooting + +### Server Not Running + +```bash +# Check if server is responding +curl http://localhost:8000/v1/images/generations \ + -H "Content-Type: application/json" \ + -d '{"prompt": "test"}' +``` + +### Out of Memory + +If you encounter OOM errors: +1. Reduce image size: `"size": "512x512"` +2. Reduce inference steps: `"num_inference_steps": 25` +3. Generate fewer images: `"n": 1` + +## Testing + +Run the test suite to verify functionality: + +```bash +# All image generation tests +pytest tests/entrypoints/openai/test_image_server.py -v + +# Specific test +pytest tests/entrypoints/openai/test_image_server.py::test_generate_single_image -v +``` + +## Development + +Enable debug logging to see prompts and generation details: + +```bash +vllm serve Qwen/Qwen-Image --omni \ + --uvicorn-log-level debug +``` diff --git a/mkdocs.yml b/mkdocs.yml index 407ec4ee98..4d4fbbb605 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -89,6 +89,9 @@ plugins: exclude: - "re:vllm_omni\\._.*" # Internal modules - "vllm_omni.diffusion.models.qwen_image" # avoid importing vllm in mkdocs building + - "vllm_omni.entrypoints.async_diffusion" # avoid importing vllm in mkdocs building + - "vllm_omni.entrypoints.openai" # avoid importing vllm in mkdocs building + - "vllm_omni.entrypoints.openai.protocol" # avoid importing vllm in mkdocs building - mkdocstrings: handlers: python: diff --git a/tests/entrypoints/openai/test_image_server.py b/tests/entrypoints/openai/test_image_server.py new file mode 100644 index 0000000000..524aed726b --- /dev/null +++ b/tests/entrypoints/openai/test_image_server.py @@ -0,0 +1,439 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Tests for async image generation API endpoints. + +This module contains unit tests and integration tests (with mocking) for the +OpenAI-compatible async text-to-image generation API endpoints in api_server.py. 
+""" + +import base64 +import io +from unittest.mock import AsyncMock, Mock + +import pytest +from fastapi.testclient import TestClient +from PIL import Image + +from vllm_omni.entrypoints.openai.image_api_utils import ( + encode_image_base64, + parse_size, +) + +# Unit Tests + + +def test_parse_size_valid(): + """Test size parsing with valid inputs""" + assert parse_size("1024x1024") == (1024, 1024) + assert parse_size("512x768") == (512, 768) + assert parse_size("256x256") == (256, 256) + assert parse_size("1792x1024") == (1792, 1024) + assert parse_size("1024x1792") == (1024, 1792) + + +def test_parse_size_invalid(): + """Test size parsing with invalid inputs""" + with pytest.raises(ValueError, match="Invalid size format"): + parse_size("invalid") + + with pytest.raises(ValueError, match="Invalid size format"): + parse_size("1024") + + with pytest.raises(ValueError, match="Invalid size format"): + parse_size("1024x") + + with pytest.raises(ValueError, match="Invalid size format"): + parse_size("x1024") + + +def test_parse_size_negative(): + """Test size parsing with negative or zero dimensions""" + with pytest.raises(ValueError, match="positive integers"): + parse_size("0x1024") + + with pytest.raises(ValueError, match="positive integers"): + parse_size("1024x0") + + with pytest.raises(ValueError): + parse_size("-1024x1024") + + +def test_parse_size_edge_cases(): + """Test size parsing with edge cases like empty strings and non-integers""" + # Empty string + with pytest.raises(ValueError, match="non-empty string"): + parse_size("") + + # Non-integer dimensions + with pytest.raises(ValueError, match="must be integers"): + parse_size("abc x def") + + with pytest.raises(ValueError, match="must be integers"): + parse_size("1024.5x768.5") + + # Missing separator (user might forget 'x') + with pytest.raises(ValueError, match="separator"): + parse_size("1024 1024") + + +def test_encode_image_base64(): + """Test image encoding to base64""" + # Create a simple test image 
+ img = Image.new("RGB", (64, 64), color="red") + b64_str = encode_image_base64(img) + + # Should be valid base64 + assert isinstance(b64_str, str) + assert len(b64_str) > 0 + + # Should decode back to PNG + decoded = base64.b64decode(b64_str) + decoded_img = Image.open(io.BytesIO(decoded)) + + # Verify properties + assert decoded_img.size == (64, 64) + assert decoded_img.format == "PNG" + + +# Integration Tests (with mocking) + + +class MockGenerationResult: + """Mock result object from AsyncOmniDiffusion.generate()""" + + def __init__(self, images): + self.images = images + + +@pytest.fixture +def mock_async_diffusion(): + """Mock AsyncOmniDiffusion instance that returns fake images""" + mock = Mock() + + async def generate(**kwargs): + # Return n PIL images wrapped in result object + n = kwargs.get("num_outputs_per_prompt", 1) + images = [Image.new("RGB", (64, 64), color="blue") for _ in range(n)] + return MockGenerationResult(images) + + mock.generate = AsyncMock(side_effect=generate) + return mock + + +@pytest.fixture +def test_client(mock_async_diffusion): + """Create test client with mocked async diffusion engine""" + from fastapi import FastAPI + + from vllm_omni.entrypoints.openai.api_server import router + + app = FastAPI() + app.include_router(router) + + # Set up app state with diffusion engine + app.state.diffusion_engine = mock_async_diffusion + app.state.diffusion_model_name = "Qwen/Qwen-Image" + + return TestClient(app) + + +@pytest.mark.skip(reason="Async API server uses different health check mechanism") +def test_health_endpoint(test_client): + """Test health check endpoint - skipped for async server""" + pass + + +def test_generate_single_image(test_client): + """Test generating a single image""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a cat", + "n": 1, + "size": "1024x1024", + }, + ) + assert response.status_code == 200 + data = response.json() + + # Check response structure + assert "created" in data + 
assert isinstance(data["created"], int) + assert "data" in data + assert len(data["data"]) == 1 + assert "b64_json" in data["data"][0] + + # Verify image can be decoded + img_bytes = base64.b64decode(data["data"][0]["b64_json"]) + img = Image.open(io.BytesIO(img_bytes)) + assert img.size == (64, 64) # Our mock returns 64x64 images + + +def test_generate_multiple_images(test_client): + """Test generating multiple images""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a dog", + "n": 3, + "size": "512x512", + }, + ) + assert response.status_code == 200 + data = response.json() + assert len(data["data"]) == 3 + + # All images should be valid + for img_data in data["data"]: + assert "b64_json" in img_data + img_bytes = base64.b64decode(img_data["b64_json"]) + img = Image.open(io.BytesIO(img_bytes)) + assert img.format == "PNG" + + +def test_with_negative_prompt(test_client): + """Test with negative prompt""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "beautiful landscape", + "negative_prompt": "blurry, low quality", + "size": "1024x1024", + }, + ) + assert response.status_code == 200 + + +def test_with_seed(test_client): + """Test with seed for reproducibility""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a tree", + "seed": 42, + "size": "1024x1024", + }, + ) + assert response.status_code == 200 + + +def test_with_custom_parameters(test_client): + """Test with custom diffusion parameters""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a mountain", + "size": "1024x1024", + "num_inference_steps": 100, + "true_cfg_scale": 5.5, + "seed": 123, + }, + ) + assert response.status_code == 200 + + +def test_invalid_size(test_client): + """Test with invalid size parameter - rejected by Pydantic""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a cat", + "size": "invalid", + }, + ) + # Pydantic 
validation errors return 422 (Unprocessable Entity) + # "invalid" has no "x" so Pydantic rejects it + assert response.status_code == 422 + # Check error detail contains size validation message + detail = str(response.json()["detail"]) + assert "size" in detail.lower() or "invalid" in detail.lower() + + +def test_invalid_size_parse_error(test_client): + """Test with malformed size - passes Pydantic but fails parse_size()""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a cat", + "size": "1024x", # Has "x" so Pydantic accepts, but parse_size() rejects + }, + ) + # parse_size() raises ValueError → endpoint converts to 400 (Bad Request) + assert response.status_code == 400 + detail = str(response.json()["detail"]) + assert "size" in detail.lower() or "invalid" in detail.lower() + + +def test_missing_prompt(test_client): + """Test with missing required prompt field""" + response = test_client.post( + "/v1/images/generations", + json={ + "size": "1024x1024", + }, + ) + # Pydantic validation error + assert response.status_code == 422 + + +def test_invalid_n_parameter(test_client): + """Test with invalid n parameter (out of range)""" + # n < 1 + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a cat", + "n": 0, + }, + ) + assert response.status_code == 422 + + # n > 10 + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a cat", + "n": 11, + }, + ) + assert response.status_code == 422 + + +def test_url_response_format_not_supported(test_client): + """Test that URL format returns error""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a cat", + "response_format": "url", + }, + ) + # Pydantic validation errors return 422 (Unprocessable Entity) + assert response.status_code == 422 + # Check error mentions response_format or b64_json + detail = str(response.json()["detail"]) + assert "b64_json" in detail.lower() or "response" in detail.lower() + + +def 
test_model_not_loaded(): + """Test error when diffusion engine is not initialized""" + from fastapi import FastAPI + + from vllm_omni.entrypoints.openai.api_server import router + + app = FastAPI() + app.include_router(router) + # Don't set diffusion_engine to simulate uninitialized state + app.state.diffusion_engine = None + + client = TestClient(app) + response = client.post( + "/v1/images/generations", + json={ + "prompt": "a cat", + }, + ) + assert response.status_code == 503 + assert "not initialized" in response.json()["detail"].lower() + + +def test_different_image_sizes(test_client): + """Test various valid image sizes""" + sizes = ["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"] + + for size in sizes: + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "a test image", + "size": size, + }, + ) + assert response.status_code == 200, f"Failed for size {size}" + + +def test_parameter_validation(): + """Test Pydantic model validation""" + from vllm_omni.entrypoints.openai.protocol.images import ImageGenerationRequest + + # Valid request - optional parameters default to None + req = ImageGenerationRequest(prompt="test") + assert req.prompt == "test" + assert req.n == 1 + assert req.model is None + assert req.size is None # Engine will use model defaults + assert req.num_inference_steps is None # Engine will use model defaults + assert req.true_cfg_scale is None # Engine will use model defaults + + # Invalid num_inference_steps (out of range) + with pytest.raises(ValueError): + ImageGenerationRequest(prompt="test", num_inference_steps=0) + + with pytest.raises(ValueError): + ImageGenerationRequest(prompt="test", num_inference_steps=201) + + # Invalid guidance_scale (out of range) + with pytest.raises(ValueError): + ImageGenerationRequest(prompt="test", guidance_scale=-1.0) + + with pytest.raises(ValueError): + ImageGenerationRequest(prompt="test", guidance_scale=21.0) + + +# Pass-Through Tests + + +def 
test_parameters_passed_through(test_client, mock_async_diffusion): + """Verify all parameters passed through without modification""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "test", + "num_inference_steps": 100, + "guidance_scale": 7.5, + "true_cfg_scale": 3.0, + "seed": 42, + }, + ) + assert response.status_code == 200 + + # Ensure generate() was called exactly once + mock_async_diffusion.generate.assert_awaited_once() + call_kwargs = mock_async_diffusion.generate.call_args[1] + assert call_kwargs["num_inference_steps"] == 100 + assert call_kwargs["guidance_scale"] == 7.5 + assert call_kwargs["true_cfg_scale"] == 3.0 + assert call_kwargs["seed"] == 42 + + +def test_optional_parameters_omitted(test_client, mock_async_diffusion): + """Verify optional parameters not passed when omitted""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "test", + "size": "512x512", + }, + ) + assert response.status_code == 200 + + # Ensure generate() was called exactly once + mock_async_diffusion.generate.assert_awaited_once() + call_kwargs = mock_async_diffusion.generate.call_args[1] + assert "num_inference_steps" not in call_kwargs + assert "guidance_scale" not in call_kwargs + assert "true_cfg_scale" not in call_kwargs + + +def test_model_field_omitted_works(test_client): + """Test that omitting model field works""" + response = test_client.post( + "/v1/images/generations", + json={ + "prompt": "test", + "size": "1024x1024", + # model field omitted + }, + ) + assert response.status_code == 200 diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index ca30bb901e..61f4079ab6 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -2,6 +2,9 @@ import multiprocessing import multiprocessing.forkserver as forkserver import os + +# Image generation API imports +import time from argparse import Namespace from 
collections.abc import AsyncIterator from contextlib import asynccontextmanager @@ -44,11 +47,23 @@ from vllm_omni.diffusion.utils.hf_utils import is_diffusion_model from vllm_omni.entrypoints.async_diffusion import AsyncOmniDiffusion from vllm_omni.entrypoints.async_omni import AsyncOmni +from vllm_omni.entrypoints.openai.image_api_utils import ( + encode_image_base64, + parse_size, +) +from vllm_omni.entrypoints.openai.protocol.images import ( + ImageData, + ImageGenerationRequest, + ImageGenerationResponse, +) from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat logger = init_logger(__name__) +# Server entry points + + async def omni_run_server(args, **uvicorn_kwargs) -> None: """Run a single-worker API server. @@ -465,6 +480,7 @@ async def omni_diffusion_init_app_state( model_name = served_model_names[0] if served_model_names else args.model state.diffusion_engine = diffusion_engine + state.diffusion_model_name = model_name # Store for image endpoints state.log_stats = not getattr(args, "disable_log_stats", False) # Initialize chat handler with diffusion engine (uses /v1/chat/completions endpoint) @@ -516,3 +532,108 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re return JSONResponse(content=generator.model_dump()) return StreamingResponse(content=generator, media_type="text/event-stream") + + +# Image generation API endpoints + + +@router.post( + "/v1/images/generations", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"model": ImageGenerationResponse}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.SERVICE_UNAVAILABLE.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +async def generate_images(request: ImageGenerationRequest, raw_request: Request) -> ImageGenerationResponse: + """Generate images from text prompts using diffusion models. 
+ + OpenAI DALL-E compatible endpoint for text-to-image generation. + + Args: + request: Image generation request with prompt and parameters + raw_request: Raw FastAPI request for accessing app state + + Returns: + ImageGenerationResponse with generated images as base64 PNG + + Raises: + HTTPException: For validation errors, missing engine, or generation failures + """ + # Get diffusion engine from app state + diffusion_engine: AsyncOmniDiffusion | None = getattr(raw_request.app.state, "diffusion_engine", None) + if diffusion_engine is None: + raise HTTPException( + status_code=HTTPStatus.SERVICE_UNAVAILABLE.value, + detail="Diffusion engine not initialized. Start server with a diffusion model.", + ) + + # Get server's loaded model + model_name = getattr(raw_request.app.state, "diffusion_model_name", "unknown") + + # Validate model field (warn if mismatch, don't error) + if request.model is not None and request.model != model_name: + logger.warning( + f"Model mismatch: request specifies '{request.model}' but " + f"server is running '{model_name}'. Using server model." 
+ ) + + try: + # Build params - pass through user values directly + gen_params = { + "prompt": request.prompt, + "num_outputs_per_prompt": request.n, + } + + # Parse and add size if provided + if request.size: + width, height = parse_size(request.size) + gen_params["height"] = height + gen_params["width"] = width + size_str = f"{width}x{height}" + else: + size_str = "model default" + + # Add optional parameters ONLY if provided + if request.num_inference_steps is not None: + gen_params["num_inference_steps"] = request.num_inference_steps + if request.negative_prompt is not None: + gen_params["negative_prompt"] = request.negative_prompt + if request.guidance_scale is not None: + gen_params["guidance_scale"] = request.guidance_scale + if request.true_cfg_scale is not None: + gen_params["true_cfg_scale"] = request.true_cfg_scale + if request.seed is not None: + gen_params["seed"] = request.seed + + logger.info(f"Generating {request.n} image(s) {size_str}") + + # Generate images using AsyncOmniDiffusion + result = await diffusion_engine.generate(**gen_params) + + # Extract images from result + images = result.images if hasattr(result, "images") else [] + + logger.info(f"Successfully generated {len(images)} image(s)") + + # Encode images to base64 + image_data = [ImageData(b64_json=encode_image_base64(img), revised_prompt=None) for img in images] + + return ImageGenerationResponse( + created=int(time.time()), + data=image_data, + ) + + except HTTPException: + # Re-raise HTTPExceptions as-is + raise + except ValueError as e: + logger.error(f"Validation error: {e}") + raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)) + except Exception as e: + logger.exception(f"Image generation failed: {e}") + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=f"Image generation failed: {str(e)}" + ) diff --git a/vllm_omni/entrypoints/openai/image_api_utils.py b/vllm_omni/entrypoints/openai/image_api_utils.py new file mode 100644 
index 0000000000..7a9d8fa524 --- /dev/null +++ b/vllm_omni/entrypoints/openai/image_api_utils.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Shared helper utilities for OpenAI-compatible image generation API. + +This module provides common helper functions for the image generation endpoint. +All functions work with plain Python types to maintain separation from the +FastAPI HTTP layer. +""" + +import base64 +import io + +import PIL.Image + + +def parse_size(size_str: str) -> tuple[int, int]: + """Parse size string to width and height tuple. + + Args: + size_str: Size in format "WIDTHxHEIGHT" (e.g., "1024x1024") + + Returns: + Tuple of (width, height) + + Raises: + ValueError: If size format is invalid + """ + if not size_str or not isinstance(size_str, str): + raise ValueError( + f"Size must be a non-empty string in format 'WIDTHxHEIGHT' (e.g., '1024x1024'), got: {size_str}" + ) + + parts = size_str.split("x") + if len(parts) != 2: + raise ValueError( + f"Invalid size format: '{size_str}'. Expected format: 'WIDTHxHEIGHT' (e.g., '1024x1024'). " + f"Did you mean to use 'x' as separator?" + ) + + try: + width = int(parts[0]) + height = int(parts[1]) + except ValueError: + raise ValueError(f"Invalid size format: '{size_str}'. Width and height must be integers.") + + if width <= 0 or height <= 0: + raise ValueError(f"Invalid size: {width}x{height}. Width and height must be positive integers.") + + return width, height + + +def encode_image_base64(image: PIL.Image.Image) -> str: + """Encode PIL Image to base64 PNG string. 
+ + Args: + image: PIL Image object + + Returns: + Base64-encoded PNG image as string + """ + buffer = io.BytesIO() + image.save(buffer, format="PNG") + buffer.seek(0) + return base64.b64encode(buffer.read()).decode("utf-8") diff --git a/vllm_omni/entrypoints/openai/protocol/__init__.py b/vllm_omni/entrypoints/openai/protocol/__init__.py new file mode 100644 index 0000000000..b17b648eb6 --- /dev/null +++ b/vllm_omni/entrypoints/openai/protocol/__init__.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm_omni.entrypoints.openai.protocol.images import ( + ImageData, + ImageGenerationRequest, + ImageGenerationResponse, + ResponseFormat, +) + +__all__ = [ + "ImageData", + "ImageGenerationRequest", + "ImageGenerationResponse", + "ResponseFormat", +] diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py new file mode 100644 index 0000000000..cb7c346ac7 --- /dev/null +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +OpenAI-compatible protocol definitions for image generation. + +This module provides Pydantic models that follow the OpenAI DALL-E API specification +for text-to-image generation, with vllm-omni specific extensions. +""" + +from enum import Enum + +from pydantic import BaseModel, Field, field_validator + + +class ResponseFormat(str, Enum): + """Image response format""" + + B64_JSON = "b64_json" + URL = "url" # Not implemented in PoC + + +class ImageGenerationRequest(BaseModel): + """ + OpenAI DALL-E compatible image generation request. + + Follows the OpenAI Images API specification with vllm-omni extensions + for advanced diffusion parameters. 
+ """ + + # Required fields + prompt: str = Field(..., description="Text description of the desired image(s)") + + # OpenAI standard fields + model: str | None = Field( + default=None, + description="Model to use (optional, uses server's configured model if omitted)", + ) + n: int = Field(default=1, ge=1, le=10, description="Number of images to generate") + size: str | None = Field( + default=None, + description="Image dimensions in WIDTHxHEIGHT format (e.g., '1024x1024', uses model defaults if omitted)", + ) + response_format: ResponseFormat = Field(default=ResponseFormat.B64_JSON, description="Format of the returned image") + user: str | None = Field(default=None, description="User identifier for tracking") + + @field_validator("size") + @classmethod + def validate_size(cls, v): + """Validate size parameter. + + Accepts any string in 'WIDTHxHEIGHT' format (e.g., '1024x1024', '512x768'). + No restrictions on specific dimensions - models can handle arbitrary sizes. + """ + if v is None: + return None + # Validate string format + if not isinstance(v, str) or "x" not in v: + raise ValueError("size must be in format 'WIDTHxHEIGHT' (e.g., '1024x1024')") + return v + + @field_validator("response_format") + @classmethod + def validate_response_format(cls, v): + """Validate response format - only b64_json is supported.""" + if v is not None and v != ResponseFormat.B64_JSON: + raise ValueError(f"Only 'b64_json' response format is supported, got: {v}") + return v + + # vllm-omni extensions for diffusion control + negative_prompt: str | None = Field(default=None, description="Text describing what to avoid in the image") + num_inference_steps: int | None = Field( + default=None, + ge=1, + le=200, + description="Number of diffusion sampling steps (uses model defaults if not specified)", + ) + guidance_scale: float | None = Field( + default=None, + ge=0.0, + le=20.0, + description="Classifier-free guidance scale (uses model defaults if not specified)", + ) + true_cfg_scale: 
float | None = Field( + default=None, + ge=0.0, + le=20.0, + description="True CFG scale (model-specific parameter, may be ignored if not supported)", + ) + seed: int | None = Field(default=None, description="Random seed for reproducibility") + + # VAE memory optimizations (set at model init, included for completeness) + vae_use_slicing: bool | None = Field(default=False, description="Enable VAE slicing") + vae_use_tiling: bool | None = Field(default=False, description="Enable VAE tiling") + + +class ImageData(BaseModel): + """Single generated image data""" + + b64_json: str | None = Field(default=None, description="Base64-encoded PNG image") + url: str | None = Field(default=None, description="Image URL (not implemented)") + revised_prompt: str | None = Field(default=None, description="Revised prompt (OpenAI compatibility, always null)") + + +class ImageGenerationResponse(BaseModel): + """ + OpenAI DALL-E compatible image generation response. + + Returns generated images with metadata. + """ + + created: int = Field(..., description="Unix timestamp of when the generation completed") + data: list[ImageData] = Field(..., description="Array of generated images")