diff --git a/src/litserve/api.py b/src/litserve/api.py index 28e375e4..d806a860 100644 --- a/src/litserve/api.py +++ b/src/litserve/api.py @@ -29,6 +29,104 @@ class LitAPI(ABC): + """Define inference logic for the model. + + LitAPI is the core abstraction for serving AI models with LitServe. It provides a clean + interface for model loading, request processing, and response generation with automatic + optimizations like batching, streaming, and async processing. + + Core Workflow: + 1. **setup()**: Load and initialize the model once per worker + 2. **decode_request()**: Convert HTTP request to model input format + 3. **predict()**: Run model inference on the input + 4. **encode_response()**: Convert model output to HTTP response format + + Quick Start: + ```python + import litserve as ls + + class MyAPI(ls.LitAPI): + def setup(self, device): + self.model = lambda x: x**2 + + def predict(self, x): + return self.model(x["input"]) + + server = ls.LitServer(MyAPI()) + server.run() + ``` + + Required Methods: + setup(device): Initialize the model and resources + predict(x): Core inference logic + + Optional Methods: + decode_request(request): Transform HTTP requests to model input + encode_response(output): Transform model outputs to HTTP responses + batch(inputs)/unbatch(outputs): Custom batching logic + + Configuration: + max_batch_size: Batch multiple requests for better GPU utilization. Defaults to 1. + batch_timeout: Wait time for batch to fill (seconds). Defaults to 0.0. + stream: Enable streaming responses for real-time output. Defaults to False. + api_path: URL endpoint path. Defaults to "/predict". + enable_async: Enable async/await for non-blocking operations. Defaults to False. + spec: API specification (e.g., OpenAISpec for OpenAI compatibility). Defaults to None. + mcp: Model Context Protocol integration for AI assistants. Defaults to None. + + Examples: + Batched GPU Inference: + ```python + class BatchedAPI(ls.LitAPI): + def setup(self, device): + self.model = load_model().to(device) + + def predict(self, batch): + return self.model(batch) + + api = BatchedAPI(max_batch_size=8, batch_timeout=0.1) + ``` + + Streaming LLM: + ```python + class StreamingLLM(ls.LitAPI): + def setup(self, device): + self.model = load_llm() + + def predict(self, prompt): + for token in self.model.generate_stream(prompt): + yield token + + api = StreamingLLM(stream=True) + ``` + + OpenAI-Compatible: + ```python + from litserve.specs import OpenAISpec + + class ChatAPI(ls.LitAPI): + def setup(self, device): + self.model = load_chat_model() + + def predict(self, messages): + return self.model.chat(messages) + + api = ChatAPI(spec=OpenAISpec()) + ``` + + Performance Tips: + - Use batching for GPU models to maximize utilization + - Enable streaming for operations taking >1 second + - Use async for I/O-bound operations (databases, external APIs) + - Load models in setup(), not __init__ + - Monitor GPU memory usage with larger batch sizes + + See Also: + - LitServer: Server class for hosting APIs + - LitSpec: API specifications for standard interfaces + + """ + _stream: bool = False _default_unbatch: Optional[Callable] = None _spec: Optional[LitSpec] = None @@ -47,60 +145,7 @@ def __init__( mcp: Optional["MCP"] = None, enable_async: bool = False, ): - """Initialize a LitAPI instance that defines the model's inference behavior. - - Args: - max_batch_size (int, optional): - Maximum requests to batch together for inference. 
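To complement the workflow described in the class docstring above, here is a minimal runnable sketch that exercises all four hooks (`setup`, `decode_request`, `predict`, `encode_response`); the lambda model and the `{"input": ...}` payload shape are illustrative placeholders, not LitServe defaults:

```python
import litserve as ls


class FullWorkflowAPI(ls.LitAPI):
    def setup(self, device):
        # Runs once per worker: load weights here, not in __init__.
        self.model = lambda x: x ** 2  # stand-in for real model loading

    def decode_request(self, request):
        # Convert the parsed HTTP payload into the model's input format.
        return request["input"]

    def predict(self, x):
        return self.model(x)

    def encode_response(self, output):
        # Convert the model output into a JSON-serializable response body.
        return {"output": output}


if __name__ == "__main__":
    server = ls.LitServer(FullWorkflowAPI())
    server.run(port=8000)
```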
Higher values improve throughput - for models that benefit from batching but use more memory. Defaults to 1. - - batch_timeout (float, optional): - Maximum seconds to wait for a batch to fill before processing incomplete batches. - Lower values reduce latency, higher values improve batching efficiency. Defaults to 0.0. - - api_path (str, optional): - URL endpoint path for predictions (e.g., "/predict", "/v1/chat"). Defaults to "/predict". - - stream (bool, optional): - Enable streaming responses for real-time output (useful for LLMs, long-running tasks). - Requires implementing encode_response() for streaming. Defaults to False. - - loop (Union[str, LitLoop], optional): - Inference loop strategy. "auto" selects optimal loop based on batching/streaming settings, - or provide custom LitLoop instance for advanced control. Defaults to "auto". - - spec (LitSpec, optional): - API specification defining input/output schemas and behavior. Use OpenAISpec for - OpenAI-compatible APIs or custom LitSpec implementations. Defaults to None. - - mcp (MCP, optional): - Enable MCP server for the API. Provide tool description and input schema. Defaults to None. - - enable_async (bool, optional): - Enable async/await support for non-blocking operations in predict() method. - Useful for I/O-bound inference or external API calls. Defaults to False. - - Example: - >>> # Simple API - >>> api = LitAPI() - - >>> # Batched inference - >>> api = LitAPI(max_batch_size=8, batch_timeout=0.1) - - >>> # OpenAI-compatible API - >>> api = LitAPI(spec=OpenAISpec()) - - >>> # Async processing - >>> api = LitAPI(enable_async=True) - - >>> # MCP server - >>> api = LitAPI(mcp=MCP(description="A simple API", input_schema={"name": "string"})) - - Note: - You must implement setup(), predict(), and optionally decode_request()/encode_response() - methods to define your model's behavior. - - """ + """Initialize LitAPI with configuration options.""" if max_batch_size <= 0: raise ValueError("max_batch_size must be greater than 0") diff --git a/src/litserve/mcp.py b/src/litserve/mcp.py index 8e80c66a..99023060 100644 --- a/src/litserve/mcp.py +++ b/src/litserve/mcp.py @@ -248,17 +248,87 @@ class ToolEndpointType(types.Tool): class MCP: - """MCP is a spec that can be used to create MCP tools for LitServe endpoints. It doesn't affect LitAPI. + """Enable Model Context Protocol (MCP) integration for LitServe APIs. - Example: - >>> api = LitAPI(mcp=MCP(description="A simple API", input_schema={"name": "string"})) + This enables LitServe APIs to be seamlessly integrated into MCP-compatible AI systems, + making models accessible as tools within larger AI workflows and agent frameworks. + Quick Start: + ```python + from pydantic import BaseModel + from litserve.mcp import MCP + import litserve as ls - Spec vs MCP: - - specs (like the OpenAI spec) affects the API endpoint, the request-response format, and the LitAPI methods. - - MCP, on the other hand, works differently. It doesn't follow the OpenAI spec. Instead, it only uses metadata like the name and description to generate an additional endpoint via MCPServer. + class PowerRequest(BaseModel): + input: float - """ # noqa: E501 + class MyLitAPI(ls.test_examples.SimpleLitAPI): + def decode_request(self, request: PowerRequest) -> int: + return request.input + + if __name__ == "__main__": + mcp=MCP(description="Returns the power of a number.") + api = MyLitAPI(mcp=mcp) + server = ls.LitServer(api) + server.run() + ``` + + Args: + name: + Tool name for MCP registration. 
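For example (a hedged sketch: the tool name and description are illustrative, and `MyLitAPI` refers to the class from the Quick Start above):

```python
from litserve.mcp import MCP

# Register the tool under an explicit, descriptive name instead of the
# api_path-derived default.
mcp = MCP(
    name="number_power",
    description="Returns the power of a number.",
)
api = MyLitAPI(mcp=mcp)
```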
Defaults to None (uses api_path). + + - Should be descriptive and unique within the MCP server + - Automatically converts "/" to "_" for compatibility + - Used by AI systems to identify and call the tool + + description: + Human-readable description of what the tool does. Defaults to None (uses class docstring). + + - Essential for AI systems to understand when to use the tool + - Should clearly explain the purpose and capabilities + - Used in tool selection and orchestration + + input_schema: + JSON Schema defining expected input format. Defaults to None (auto-extracted). + + - Describes the structure and types of input parameters + - Helps AI systems format requests correctly + - Auto-extracted from decode_request method if not provided + + Schema Auto-Extraction: + If no input_schema is provided, MCP automatically extracts it from type hints in the decode_request method: + + ```python + from pydantic import BaseModel + + class Request(BaseModel): + input: str + + class AutoSchemaAPI(ls.LitAPI): + def decode_request(self, request: Request)->str: + # MCP analyzes the type hints to generate schema: + # input: str -> {"input": {"type": "string", "title": "Input"}} + return request.input + ``` + + Supported type annotations: + - Basic types: `str`, `int`, `float`, `bool`, `list`, `dict` + - Optional types: `Optional[str]`, `Union[str, None]` + - Pydantic models: Full schema extraction with validation + - Complex types: `List[str]`, `Dict[str, Any]` + + Notes: + - MCP integration is optional and doesn't affect non-MCP clients + - Tool names are automatically sanitized (/ becomes _) + - Original API endpoints remain unchanged and fully functional + - Compatible with all LitServe features (batching, streaming, etc.) + + See Also: + - Model Context Protocol documentation: https://lightning.ai/docs/litserve/features/mcp + - LitAPI: Base class for API implementation + - LitServer: Server class for hosting APIs + + """ def __init__( self, diff --git a/src/litserve/server.py b/src/litserve/server.py index ab6c9b61..126dca42 100644 --- a/src/litserve/server.py +++ b/src/litserve/server.py @@ -369,6 +369,245 @@ async def handle_request(self, request, request_type) -> StreamingResponse: class LitServer: + """Initialize a LitServer for high-performance AI model serving. + + LitServer transforms AI models into production-ready APIs with automatic scaling, + batching, streaming, and multi-GPU support. + + Quick Start: + ```python + import litserve as ls + + # Define inference pipeline + class MyAPI(ls.LitAPI): + def setup(self, device): + self.model = load_model() # model loading logic + + def predict(self, x): + return self.model(x) + + # Create and run server + server = ls.LitServer(MyAPI()) + server.run(port=8000) + ``` + + Args: + lit_api: + The core component - one or more LitAPI instances defining model logic. + + - Single API: `MyAPI()` for serving one model + - Multiple APIs: `[API1(), API2()]` for multi-model serving + + Each LitAPI must implement: + - `setup(device)`: Initialize the model + - `predict(x)`: Run inference + - Optional: `decode_request()`, `encode_response()` for custom I/O + + Hardware Configuration: + accelerator: + Hardware type for inference. Defaults to "auto". + + - "auto": Automatically detects best available (CUDA > MPS > CPU) + - "cpu": Force CPU usage + - "cuda": Use NVIDIA GPUs + - "mps": Use Apple Metal Performance Shaders + + devices: + Number of devices to use. Defaults to "auto". 
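As an illustration (a hedged sketch: `MyAPI` is the class from the Quick Start above and the device counts are arbitrary):

```python
import litserve as ls

# Pin the server to two CUDA GPUs with one worker per GPU; on a CPU-only
# machine, accelerator="cpu" with workers_per_device near the core count
# is a common starting point.
server = ls.LitServer(
    MyAPI(),
    accelerator="cuda",
    devices=2,
    workers_per_device=1,
)
```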
+ + - "auto": Use all available devices + - int: Use specific number (e.g., 2 for 2 GPUs) + + workers_per_device: + Worker processes per device for parallel inference. Defaults to 1. + + - Higher values = better throughput but more memory usage + - Good starting point: 1-4 depending on model size + - For CPU, set to the number of cores available on the machine (e.g., 8 for 8-core CPU) + - Monitor GPU memory when increasing + + Performance & Scaling: + timeout: + Request timeout in seconds. Defaults to 30. + + - Set to False or -1 to disable timeouts + - Increase for slow models (e.g., 300 for large LLMs) + - Decrease for fast models (e.g., 5 for lightweight models) + + fast_queue: + Enable ZeroMQ for high-throughput scenarios (>100 RPS). Defaults to False. + + - Use when serving hundreds of requests per second + - Not supported on Windows + + track_requests: + Track active requests across all API servers for monitoring and load management. Defaults to False. + + When enabled, tracks the total number of active requests in the queue across all API servers + and makes this count available via callbacks using the `on_request` hook. Essential for + monitoring concurrent request load and implementing custom load management logic. + + - Recommended for production deployments + - Access count via callbacks or `server.active_requests` property + - Useful for monitoring and handling concurrent requests effectively + + API Configuration: + healthcheck_path: + Health check endpoint for load balancers. Defaults to "/health". + + - Returns 200 when all workers are ready + - Critical for Kubernetes/Docker deployments + + info_path: + Server information endpoint. Defaults to "/info". + + - Shows model metadata, device info, server config + - Useful for debugging and monitoring + + shutdown_path: + Graceful shutdown endpoint. Defaults to "/shutdown". + + enable_shutdown_api: + Enable remote shutdown capability. Defaults to False. + + - Requires authentication token (set LIT_SHUTDOWN_API_KEY env var) + - Useful for automated deployment pipelines + + Content & Middleware: + max_payload_size: + Maximum request size. Defaults to "100MB". + + - String format: "10MB", "1GB" + - Integer format: bytes (1048576 for 1MB) + - Increase for large images/videos + + middlewares: + HTTP middleware for cross-cutting concerns. Defaults to None. + + Example: + ```python + from starlette.middleware.cors import CORSMiddleware + + server = LitServer( + api, + middlewares=[ + (CORSMiddleware, {"allow_origins": ["*"]}), + # Add more middleware as needed + ] + ) + ``` + + model_metadata: + Metadata about the model displayed at info endpoint. Defaults to None. + + Example: + ```python + metadata = { + "model_name": "bert-base-uncased", + "version": "1.0.0", + "description": "Text classification model" + } + ``` + + Monitoring & Debugging: + callbacks: + Event handlers for server lifecycle. Defaults to None. + + - Built-in callbacks for logging, metrics, custom logic + - Triggers on request start/end, server start/stop + + loggers: + Custom loggers for metrics and events. Defaults to None. + + - Integrate with monitoring stack + - Track performance metrics, error rates + + Advanced Configuration: + max_batch_size, batch_timeout, spec, stream, api_path, loop: + **Deprecated**: Configure these in LitAPI implementation instead. 
+ + Migration example: + ```python + # Old way (deprecated) + server = LitServer(api, max_batch_size=8, stream=True) + + # New way (recommended) + api = MyAPI(max_batch_size=8, stream=True) + server = LitServer(api) + ``` + + Examples: + Basic Usage: + ```python + import litserve as ls + + class SimpleAPI(ls.LitAPI): + def setup(self, device): + self.model = lambda x: x * 2 # model here + + def predict(self, x): + return self.model(x) + + server = ls.LitServer(SimpleAPI()) + server.run() + ``` + + Production Setup: + ```python + server = ls.LitServer( + MyAPI(max_batch_size=8), + accelerator="cuda", + devices=2, + workers_per_device=4, + fast_queue=True, + track_requests=True, + max_payload_size="50MB", + timeout=60 + ) + server.run(port=8000, num_api_servers=4) + ``` + + Multi-Model Serving: + ```python + # Serve multiple models on different endpoints + text_api = TextClassifierAPI(api_path="/classify") + image_api = ImageClassifierAPI(api_path="/vision") + + server = ls.LitServer([text_api, image_api]) + server.run() + ``` + + Streaming Response: + ```python + class StreamingAPI(ls.LitAPI): + def setup(self, device): + self.model = load_llm() + + def predict(self, prompt): + for token in self.model.generate(prompt): + yield {"token": token} + + server = ls.LitServer(StreamingAPI(stream=True)) + ``` + + Deployment: + Self-hosted: + ```bash + python server.py # Run locally + ``` + + Lightning AI Cloud: + ```bash + lightning deploy server.py --cloud # One-click deploy + ``` + + See Also: + - LitAPI: Base class for defining model logic + - LitSpec: API specifications (OpenAI compatibility) + - Documentation: https://lightning.ai/docs/litserve + + """ + def __init__( self, lit_api: Union[LitAPI, List[LitAPI]], @@ -394,76 +633,6 @@ def __init__( api_path: Optional[str] = None, loop: Optional[Union[str, LitLoop]] = None, ): - """Initialize a LitServer instance for high-performance model inference. - - Args: - lit_api (Union[LitAPI, List[LitAPI]]): - API instance(s) defining model inference logic. Single instance or list for multi-model serving. - - accelerator (str, optional): - Hardware type: 'cpu', 'cuda', 'mps', or 'auto' (detects best available). Defaults to 'auto'. - - devices (Union[int, str], optional): - Number of devices to use, or 'auto' for all available. Defaults to 'auto'. - - workers_per_device (int, optional): - Worker processes per device. Higher values improve throughput but use more memory. Defaults to 1. - - timeout (Union[float, bool], optional): - Request timeout in seconds, or False to disable. Defaults to 30. - - healthcheck_path (str, optional): - Health check endpoint path for load balancers. Defaults to "/health". - - info_path (str, optional): - Server info endpoint path showing metadata and configuration. Defaults to "/info". - - shutdown_path (str, optional): - Server shutdown endpoint path that terminates and cleans up all worker and server processes. - Defaults to "/shutdown". - - enable_shutdown_api (bool, optional): - Enable the shutdown endpoint. If True, the server will listen for shutdown requests - at the specified path. Defaults to False. - - model_metadata (dict, optional): - Model metadata displayed at info endpoint (e.g., {"version": "1.0"}). Defaults to None. - - max_payload_size (Union[int, str], optional): - Maximum request size as bytes or string ("10MB"). Defaults to "100MB". - - track_requests (bool, optional): - Enable request tracking for monitoring. Recommended for production. Defaults to False. 
- - callbacks (List[Callback], optional): - Callback instances for lifecycle events (logging, metrics). Defaults to None. - - middlewares (List[Middleware], optional): - HTTP middleware for auth, CORS, rate limiting, etc. Defaults to None. - - loggers (List[Logger], optional): - Custom loggers for server activity. Defaults to standard logging. - - fast_queue (bool, optional): - Enable ZeroMQ for high-throughput (>100 RPS). Requires ZeroMQ installation. Defaults to False. - - max_batch_size, batch_timeout, stream, spec, api_path, loop: - **Deprecated**: Configure these in your LitAPI implementation instead. - - Example: - >>> # Basic - >>> server = LitServer(MyLitAPI()) - - >>> # Production - >>> server = LitServer( - ... lit_api=MyLitAPI(max_batch_size=4), - ... accelerator="cuda", - ... devices=2, - ... fast_queue=True, - ... track_requests=True - ... ) - - """ if max_batch_size is not None: warnings.warn( "'max_batch_size' and 'batch_timeout' are being deprecated in `LitServer` " @@ -924,49 +1093,169 @@ def run( pretty_logs: bool = False, **kwargs, ): - """Run the LitServe server to handle API requests and distribute them to inference workers. + """Start the LitServer to serve AI model requests with production-ready performance. + + This method launches the complete serving infrastructure: initializes worker processes, + starts the HTTP server, and begins handling requests. The server runs until manually + stopped (Ctrl+C) or programmatically shut down. + + Quick Start: + ```python + # Basic usage - starts server on localhost:8000 + server.run() + + # Production - multiple servers and custom port + server.run(port=8080, num_api_servers=4) + ``` + + Server Lifecycle: + 1. **Initialize**: Sets up worker processes and communication queues + 2. **Health Check**: Verifies all workers are ready to serve requests + 3. **Start HTTP Server**: Begins accepting requests on specified host/port + 4. **Serve Requests**: Distributes requests to workers and returns responses + 5. **Graceful Shutdown**: Properly terminates workers when stopped Args: - host (str, optional): - Host address to bind to. "0.0.0.0" for all IPs, "127.0.0.1" for localhost only. Defaults to "0.0.0.0". + host: + Network interface to bind the server to. Defaults to "0.0.0.0". + + - "0.0.0.0": Accept connections from any IP (public access) + - "127.0.0.1": Only accept local connections (localhost only) + - "::": IPv6 equivalent of "0.0.0.0" + + For development, use "127.0.0.1" for security. For production/Docker, use "0.0.0.0". + + port: + Port number to listen on. Defaults to 8000. + + - Must be between 1024-65535 (privileged ports require admin) + - Ensure the port is available and not blocked by firewalls + - Common choices: 8000, 8080, 3000, 5000 + + Performance Configuration: + num_api_servers: + Number of parallel HTTP server processes. Defaults to None (auto-detect). - port (Union[str, int], optional): - Port number to bind to. Must be available. Defaults to 8000. + - None: Uses same count as inference workers (recommended) + - Higher values improve HTTP throughput but use more memory + - Good starting point: 2-8 depending on expected load + - Each server handles HTTP requests independently - num_api_servers (Optional[int], optional): - Number of uvicorn server instances for parallel API handling. Higher values improve - throughput but use more resources. Defaults to None (single instance). + api_server_worker_type: + Process architecture for HTTP servers. Defaults to "process". 
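Deployment scripts often wait for the health check described in the server lifecycle above before routing traffic; a hedged sketch that polls the default `/health` endpoint (the `requests` dependency and the helper below are illustrative, not part of LitServe):

```python
import time

import requests


def wait_until_ready(url="http://127.0.0.1:8000/health", timeout=60.0):
    """Poll the health endpoint until it returns 200 or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass
        time.sleep(1)
    return False
```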
- log_level (str, optional): - Logging level: "critical", "error", "warning", "info", "debug", "trace". - Use "debug" for development. Defaults to "info". + - "process": Better isolation, CPU utilization, and fault tolerance + - "thread": Lower memory usage but shared memory space + - Windows automatically uses "thread" (process forking not supported) - generate_client_file (bool, optional): - Auto-generate Python client file with typed methods for API interaction. Defaults to True. + Development & Debugging: + log_level: + Logging verbosity level. Defaults to "info". - api_server_worker_type (Literal["process", "thread"], optional): - Worker type. "process" for better isolation/CPU usage, "thread" for less memory. Defaults to "process". + - "critical": Only severe errors + - "error": Error conditions + - "warning": Warning messages (good for production) + - "info": General information (default) + - "debug": Detailed debugging info (development) + - "trace": Very verbose output (troubleshooting) - pretty_logs (bool, optional): - Enhanced log formatting with colors using rich library. Good for development. Defaults to False. + pretty_logs: + Enable enhanced log formatting with colors and rich formatting. Defaults to False. + - Requires: `pip install rich` + - Great for development and local debugging + - May not display properly in some production log aggregators + + generate_client_file: + Auto-generate a Python client file for easy API interaction. Defaults to True. + + - Creates `client.py` in current directory with typed methods + - Useful for testing and integration + - Safe to disable in production environments + + Advanced Configuration: **kwargs: - Additional uvicorn server options (ssl_keyfile, ssl_certfile, etc.). See uvicorn docs. - - Example: - >>> server.run() # Basic - - >>> server.run( # Production - ... port=8080, - ... num_api_servers=4, - ... log_level="warning" - ... ) - - >>> server.run( # Development - ... log_level="debug", - ... pretty_logs=True, - ... generate_client_file=True - ... ) + Additional uvicorn server configuration options. + + Common SSL options: + ```python + server.run( + ssl_keyfile="path/to/key.pem", + ssl_certfile="path/to/cert.pem" + ) + ``` + + Other uvicorn options: ssl_ca_certs, ssl_ciphers, ssl_version, + workers, backlog, etc. See uvicorn documentation for full list. 
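Once the server is running (see the examples below), any HTTP client can call it; a hedged sketch against the default `/predict` route, where the payload shape is whatever your `decode_request` expects:

```python
import requests

# Assumes a server from the examples below is listening on localhost:8000.
response = requests.post(
    "http://127.0.0.1:8000/predict",
    json={"input": 4.0},  # payload shape is defined by your decode_request
    timeout=30,
)
response.raise_for_status()
print(response.json())
```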
+ + Examples: + Basic Development: + ```python + # Simple local development + server.run() + # Access at: http://localhost:8000 + # API docs at: http://localhost:8000/docs + ``` + + Production Configuration: + ```python + # High-performance production setup + server.run( + host="0.0.0.0", + port=8000, + num_api_servers=8, + log_level="warning", + pretty_logs=False, + generate_client_file=False + ) + ``` + + Development with Debug: + ```python + # Development with detailed logging + server.run( + host="127.0.0.1", + port=8000, + log_level="debug", + pretty_logs=True, + num_api_servers=1 + ) + ``` + + Multi-API Server: + ```python + # Balance load across multiple HTTP servers + server.run( + port=8000, + num_api_servers=4, # 4 parallel HTTP servers + api_server_worker_type="process" + ) + ``` + + Server Endpoints: + Once running, the server provides several built-in endpoints: + + - **Main API**: `POST /predict` (or custom path from LitAPI) + - **Health Check**: `GET /health` - Returns 200 when ready + - **Server Info**: `GET /info` - Shows configuration and metadata + - **API Documentation**: `GET /docs` - Interactive Swagger UI + - **OpenAPI Schema**: `GET /openapi.json` - API specification + + Stopping the Server: + - **Ctrl+C**: Graceful shutdown (recommended) + - **SIGTERM**: Graceful shutdown in Docker/Kubernetes + - **Shutdown API**: POST to `/shutdown` (if enabled) + + Common Issues: + - **Port in use**: Choose different port or stop conflicting process + - **Permission denied**: Use port > 1024 or run with appropriate permissions + - **Workers not ready**: Check model loading in LitAPI.setup() method + - **Memory issues**: Reduce num_api_servers or workers_per_device + + Notes: + - Server blocks execution until stopped (use threads for non-blocking) + - Logs show startup progress and any configuration issues + - Swagger UI provides interactive API testing interface """ if generate_client_file: