diff --git a/src/litserve/api.py b/src/litserve/api.py index 28e375e4..d806a860 100644 --- a/src/litserve/api.py +++ b/src/litserve/api.py @@ -29,6 +29,104 @@ class LitAPI(ABC): + """Define inference logic for the model. + + LitAPI is the core abstraction for serving AI models with LitServe. It provides a clean + interface for model loading, request processing, and response generation with automatic + optimizations like batching, streaming, and async processing. + + Core Workflow: + 1. **setup()**: Load and initialize the model once per worker + 2. **decode_request()**: Convert HTTP request to model input format + 3. **predict()**: Run model inference on the input + 4. **encode_response()**: Convert model output to HTTP response format + + Quick Start: + ```python + import litserve as ls + + class MyAPI(ls.LitAPI): + def setup(self, device): + self.model = lambda x: x**2 + + def predict(self, x): + return self.model(x["input"]) + + server = ls.LitServer(MyAPI()) + server.run() + ``` + + Required Methods: + setup(device): Initialize the model and resources + predict(x): Core inference logic + + Optional Methods: + decode_request(request): Transform HTTP requests to model input + encode_response(output): Transform model outputs to HTTP responses + batch(inputs)/unbatch(outputs): Custom batching logic + + Configuration: + max_batch_size: Batch multiple requests for better GPU utilization. Defaults to 1. + batch_timeout: Wait time for batch to fill (seconds). Defaults to 0.0. + stream: Enable streaming responses for real-time output. Defaults to False. + api_path: URL endpoint path. Defaults to "/predict". + enable_async: Enable async/await for non-blocking operations. Defaults to False. + spec: API specification (e.g., OpenAISpec for OpenAI compatibility). Defaults to None. + mcp: Model Context Protocol integration for AI assistants. Defaults to None. + + Examples: + Batched GPU Inference: + ```python + class BatchedAPI(ls.LitAPI): + def setup(self, device): + self.model = load_model().to(device) + + def predict(self, batch): + return self.model(batch) + + api = BatchedAPI(max_batch_size=8, batch_timeout=0.1) + ``` + + Streaming LLM: + ```python + class StreamingLLM(ls.LitAPI): + def setup(self, device): + self.model = load_llm() + + def predict(self, prompt): + for token in self.model.generate_stream(prompt): + yield token + + api = StreamingLLM(stream=True) + ``` + + OpenAI-Compatible: + ```python + from litserve.specs import OpenAISpec + + class ChatAPI(ls.LitAPI): + def setup(self, device): + self.model = load_chat_model() + + def predict(self, messages): + return self.model.chat(messages) + + api = ChatAPI(spec=OpenAISpec()) + ``` + + Performance Tips: + - Use batching for GPU models to maximize utilization + - Enable streaming for operations taking >1 second + - Use async for I/O-bound operations (databases, external APIs) + - Load models in setup(), not __init__ + - Monitor GPU memory usage with larger batch sizes + + See Also: + - LitServer: Server class for hosting APIs + - LitSpec: API specifications for standard interfaces + + """ + _stream: bool = False _default_unbatch: Optional[Callable] = None _spec: Optional[LitSpec] = None @@ -47,60 +145,7 @@ def __init__( mcp: Optional["MCP"] = None, enable_async: bool = False, ): - """Initialize a LitAPI instance that defines the model's inference behavior. - - Args: - max_batch_size (int, optional): - Maximum requests to batch together for inference. 
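To complement the workflow described in the class docstring above, here is a minimal runnable sketch that exercises all four hooks (`setup`, `decode_request`, `predict`, `encode_response`); the lambda model and the `{"input": ...}` payload shape are illustrative placeholders, not LitServe defaults:

```python
import litserve as ls


class FullWorkflowAPI(ls.LitAPI):
    def setup(self, device):
        # Runs once per worker: load weights here, not in __init__.
        self.model = lambda x: x ** 2  # stand-in for real model loading

    def decode_request(self, request):
        # Convert the parsed HTTP payload into the model's input format.
        return request["input"]

    def predict(self, x):
        return self.model(x)

    def encode_response(self, output):
        # Convert the model output into a JSON-serializable response body.
        return {"output": output}


if __name__ == "__main__":
    server = ls.LitServer(FullWorkflowAPI())
    server.run(port=8000)
```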
Higher values improve throughput - for models that benefit from batching but use more memory. Defaults to 1. - - batch_timeout (float, optional): - Maximum seconds to wait for a batch to fill before processing incomplete batches. - Lower values reduce latency, higher values improve batching efficiency. Defaults to 0.0. - - api_path (str, optional): - URL endpoint path for predictions (e.g., "/predict", "/v1/chat"). Defaults to "/predict". - - stream (bool, optional): - Enable streaming responses for real-time output (useful for LLMs, long-running tasks). - Requires implementing encode_response() for streaming. Defaults to False. - - loop (Union[str, LitLoop], optional): - Inference loop strategy. "auto" selects optimal loop based on batching/streaming settings, - or provide custom LitLoop instance for advanced control. Defaults to "auto". - - spec (LitSpec, optional): - API specification defining input/output schemas and behavior. Use OpenAISpec for - OpenAI-compatible APIs or custom LitSpec implementations. Defaults to None. - - mcp (MCP, optional): - Enable MCP server for the API. Provide tool description and input schema. Defaults to None. - - enable_async (bool, optional): - Enable async/await support for non-blocking operations in predict() method. - Useful for I/O-bound inference or external API calls. Defaults to False. - - Example: - >>> # Simple API - >>> api = LitAPI() - - >>> # Batched inference - >>> api = LitAPI(max_batch_size=8, batch_timeout=0.1) - - >>> # OpenAI-compatible API - >>> api = LitAPI(spec=OpenAISpec()) - - >>> # Async processing - >>> api = LitAPI(enable_async=True) - - >>> # MCP server - >>> api = LitAPI(mcp=MCP(description="A simple API", input_schema={"name": "string"})) - - Note: - You must implement setup(), predict(), and optionally decode_request()/encode_response() - methods to define your model's behavior. - - """ + """Initialize LitAPI with configuration options.""" if max_batch_size <= 0: raise ValueError("max_batch_size must be greater than 0") diff --git a/src/litserve/mcp.py b/src/litserve/mcp.py index 8e80c66a..99023060 100644 --- a/src/litserve/mcp.py +++ b/src/litserve/mcp.py @@ -248,17 +248,87 @@ class ToolEndpointType(types.Tool): class MCP: - """MCP is a spec that can be used to create MCP tools for LitServe endpoints. It doesn't affect LitAPI. + """Enable Model Context Protocol (MCP) integration for LitServe APIs. - Example: - >>> api = LitAPI(mcp=MCP(description="A simple API", input_schema={"name": "string"})) + This enables LitServe APIs to be seamlessly integrated into MCP-compatible AI systems, + making models accessible as tools within larger AI workflows and agent frameworks. + Quick Start: + ```python + from pydantic import BaseModel + from litserve.mcp import MCP + import litserve as ls - Spec vs MCP: - - specs (like the OpenAI spec) affects the API endpoint, the request-response format, and the LitAPI methods. - - MCP, on the other hand, works differently. It doesn't follow the OpenAI spec. Instead, it only uses metadata like the name and description to generate an additional endpoint via MCPServer. + class PowerRequest(BaseModel): + input: float - """ # noqa: E501 + class MyLitAPI(ls.test_examples.SimpleLitAPI): + def decode_request(self, request: PowerRequest) -> int: + return request.input + + if __name__ == "__main__": + mcp=MCP(description="Returns the power of a number.") + api = MyLitAPI(mcp=mcp) + server = ls.LitServer(api) + server.run() + ``` + + Args: + name: + Tool name for MCP registration. 
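For example (a hedged sketch: the tool name and description are illustrative, and `MyLitAPI` refers to the class from the Quick Start above):

```python
from litserve.mcp import MCP

# Register the tool under an explicit, descriptive name instead of the
# api_path-derived default.
mcp = MCP(
    name="number_power",
    description="Returns the power of a number.",
)
api = MyLitAPI(mcp=mcp)
```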
Defaults to None (uses api_path). + + - Should be descriptive and unique within the MCP server + - Automatically converts "/" to "_" for compatibility + - Used by AI systems to identify and call the tool + + description: + Human-readable description of what the tool does. Defaults to None (uses class docstring). + + - Essential for AI systems to understand when to use the tool + - Should clearly explain the purpose and capabilities + - Used in tool selection and orchestration + + input_schema: + JSON Schema defining expected input format. Defaults to None (auto-extracted). + + - Describes the structure and types of input parameters + - Helps AI systems format requests correctly + - Auto-extracted from decode_request method if not provided + + Schema Auto-Extraction: + If no input_schema is provided, MCP automatically extracts it from type hints in the decode_request method: + + ```python + from pydantic import BaseModel + + class Request(BaseModel): + input: str + + class AutoSchemaAPI(ls.LitAPI): + def decode_request(self, request: Request)->str: + # MCP analyzes the type hints to generate schema: + # input: str -> {"input": {"type": "string", "title": "Input"}} + return request.input + ``` + + Supported type annotations: + - Basic types: `str`, `int`, `float`, `bool`, `list`, `dict` + - Optional types: `Optional[str]`, `Union[str, None]` + - Pydantic models: Full schema extraction with validation + - Complex types: `List[str]`, `Dict[str, Any]` + + Notes: + - MCP integration is optional and doesn't affect non-MCP clients + - Tool names are automatically sanitized (/ becomes _) + - Original API endpoints remain unchanged and fully functional + - Compatible with all LitServe features (batching, streaming, etc.) + + See Also: + - Model Context Protocol documentation: https://lightning.ai/docs/litserve/features/mcp + - LitAPI: Base class for API implementation + - LitServer: Server class for hosting APIs + + """ def __init__( self, diff --git a/src/litserve/server.py b/src/litserve/server.py index ab6c9b61..126dca42 100644 --- a/src/litserve/server.py +++ b/src/litserve/server.py @@ -369,6 +369,245 @@ async def handle_request(self, request, request_type) -> StreamingResponse: class LitServer: + """Initialize a LitServer for high-performance AI model serving. + + LitServer transforms AI models into production-ready APIs with automatic scaling, + batching, streaming, and multi-GPU support. + + Quick Start: + ```python + import litserve as ls + + # Define inference pipeline + class MyAPI(ls.LitAPI): + def setup(self, device): + self.model = load_model() # model loading logic + + def predict(self, x): + return self.model(x) + + # Create and run server + server = ls.LitServer(MyAPI()) + server.run(port=8000) + ``` + + Args: + lit_api: + The core component - one or more LitAPI instances defining model logic. + + - Single API: `MyAPI()` for serving one model + - Multiple APIs: `[API1(), API2()]` for multi-model serving + + Each LitAPI must implement: + - `setup(device)`: Initialize the model + - `predict(x)`: Run inference + - Optional: `decode_request()`, `encode_response()` for custom I/O + + Hardware Configuration: + accelerator: + Hardware type for inference. Defaults to "auto". + + - "auto": Automatically detects best available (CUDA > MPS > CPU) + - "cpu": Force CPU usage + - "cuda": Use NVIDIA GPUs + - "mps": Use Apple Metal Performance Shaders + + devices: + Number of devices to use. Defaults to "auto". 
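As an illustration (a hedged sketch: `MyAPI` is the class from the Quick Start above and the device counts are arbitrary):

```python
import litserve as ls

# Pin the server to two CUDA GPUs with one worker per GPU; on a CPU-only
# machine, accelerator="cpu" with workers_per_device near the core count
# is a common starting point.
server = ls.LitServer(
    MyAPI(),
    accelerator="cuda",
    devices=2,
    workers_per_device=1,
)
```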
+ + - "auto": Use all available devices + - int: Use specific number (e.g., 2 for 2 GPUs) + + workers_per_device: + Worker processes per device for parallel inference. Defaults to 1. + + - Higher values = better throughput but more memory usage + - Good starting point: 1-4 depending on model size + - For CPU, set to the number of cores available on the machine (e.g., 8 for 8-core CPU) + - Monitor GPU memory when increasing + + Performance & Scaling: + timeout: + Request timeout in seconds. Defaults to 30. + + - Set to False or -1 to disable timeouts + - Increase for slow models (e.g., 300 for large LLMs) + - Decrease for fast models (e.g., 5 for lightweight models) + + fast_queue: + Enable ZeroMQ for high-throughput scenarios (>100 RPS). Defaults to False. + + - Use when serving hundreds of requests per second + - Not supported on Windows + + track_requests: + Track active requests across all API servers for monitoring and load management. Defaults to False. + + When enabled, tracks the total number of active requests in the queue across all API servers + and makes this count available via callbacks using the `on_request` hook. Essential for + monitoring concurrent request load and implementing custom load management logic. + + - Recommended for production deployments + - Access count via callbacks or `server.active_requests` property + - Useful for monitoring and handling concurrent requests effectively + + API Configuration: + healthcheck_path: + Health check endpoint for load balancers. Defaults to "/health". + + - Returns 200 when all workers are ready + - Critical for Kubernetes/Docker deployments + + info_path: + Server information endpoint. Defaults to "/info". + + - Shows model metadata, device info, server config + - Useful for debugging and monitoring + + shutdown_path: + Graceful shutdown endpoint. Defaults to "/shutdown". + + enable_shutdown_api: + Enable remote shutdown capability. Defaults to False. + + - Requires authentication token (set LIT_SHUTDOWN_API_KEY env var) + - Useful for automated deployment pipelines + + Content & Middleware: + max_payload_size: + Maximum request size. Defaults to "100MB". + + - String format: "10MB", "1GB" + - Integer format: bytes (1048576 for 1MB) + - Increase for large images/videos + + middlewares: + HTTP middleware for cross-cutting concerns. Defaults to None. + + Example: + ```python + from starlette.middleware.cors import CORSMiddleware + + server = LitServer( + api, + middlewares=[ + (CORSMiddleware, {"allow_origins": ["*"]}), + # Add more middleware as needed + ] + ) + ``` + + model_metadata: + Metadata about the model displayed at info endpoint. Defaults to None. + + Example: + ```python + metadata = { + "model_name": "bert-base-uncased", + "version": "1.0.0", + "description": "Text classification model" + } + ``` + + Monitoring & Debugging: + callbacks: + Event handlers for server lifecycle. Defaults to None. + + - Built-in callbacks for logging, metrics, custom logic + - Triggers on request start/end, server start/stop + + loggers: + Custom loggers for metrics and events. Defaults to None. + + - Integrate with monitoring stack + - Track performance metrics, error rates + + Advanced Configuration: + max_batch_size, batch_timeout, spec, stream, api_path, loop: + **Deprecated**: Configure these in LitAPI implementation instead. 
+ + Migration example: + ```python + # Old way (deprecated) + server = LitServer(api, max_batch_size=8, stream=True) + + # New way (recommended) + api = MyAPI(max_batch_size=8, stream=True) + server = LitServer(api) + ``` + + Examples: + Basic Usage: + ```python + import litserve as ls + + class SimpleAPI(ls.LitAPI): + def setup(self, device): + self.model = lambda x: x * 2 # model here + + def predict(self, x): + return self.model(x) + + server = ls.LitServer(SimpleAPI()) + server.run() + ``` + + Production Setup: + ```python + server = ls.LitServer( + MyAPI(max_batch_size=8), + accelerator="cuda", + devices=2, + workers_per_device=4, + fast_queue=True, + track_requests=True, + max_payload_size="50MB", + timeout=60 + ) + server.run(port=8000, num_api_servers=4) + ``` + + Multi-Model Serving: + ```python + # Serve multiple models on different endpoints + text_api = TextClassifierAPI(api_path="/classify") + image_api = ImageClassifierAPI(api_path="/vision") + + server = ls.LitServer([text_api, image_api]) + server.run() + ``` + + Streaming Response: + ```python + class StreamingAPI(ls.LitAPI): + def setup(self, device): + self.model = load_llm() + + def predict(self, prompt): + for token in self.model.generate(prompt): + yield {"token": token} + + server = ls.LitServer(StreamingAPI(stream=True)) + ``` + + Deployment: + Self-hosted: + ```bash + python server.py # Run locally + ``` + + Lightning AI Cloud: + ```bash + lightning deploy server.py --cloud # One-click deploy + ``` + + See Also: + - LitAPI: Base class for defining model logic + - LitSpec: API specifications (OpenAI compatibility) + - Documentation: https://lightning.ai/docs/litserve + + """ + def __init__( self, lit_api: Union[LitAPI, List[LitAPI]], @@ -394,76 +633,6 @@ def __init__( api_path: Optional[str] = None, loop: Optional[Union[str, LitLoop]] = None, ): - """Initialize a LitServer instance for high-performance model inference. - - Args: - lit_api (Union[LitAPI, List[LitAPI]]): - API instance(s) defining model inference logic. Single instance or list for multi-model serving. - - accelerator (str, optional): - Hardware type: 'cpu', 'cuda', 'mps', or 'auto' (detects best available). Defaults to 'auto'. - - devices (Union[int, str], optional): - Number of devices to use, or 'auto' for all available. Defaults to 'auto'. - - workers_per_device (int, optional): - Worker processes per device. Higher values improve throughput but use more memory. Defaults to 1. - - timeout (Union[float, bool], optional): - Request timeout in seconds, or False to disable. Defaults to 30. - - healthcheck_path (str, optional): - Health check endpoint path for load balancers. Defaults to "/health". - - info_path (str, optional): - Server info endpoint path showing metadata and configuration. Defaults to "/info". - - shutdown_path (str, optional): - Server shutdown endpoint path that terminates and cleans up all worker and server processes. - Defaults to "/shutdown". - - enable_shutdown_api (bool, optional): - Enable the shutdown endpoint. If True, the server will listen for shutdown requests - at the specified path. Defaults to False. - - model_metadata (dict, optional): - Model metadata displayed at info endpoint (e.g., {"version": "1.0"}). Defaults to None. - - max_payload_size (Union[int, str], optional): - Maximum request size as bytes or string ("10MB"). Defaults to "100MB". - - track_requests (bool, optional): - Enable request tracking for monitoring. Recommended for production. Defaults to False. 
- - callbacks (List[Callback], optional): - Callback instances for lifecycle events (logging, metrics). Defaults to None. - - middlewares (List[Middleware], optional): - HTTP middleware for auth, CORS, rate limiting, etc. Defaults to None. - - loggers (List[Logger], optional): - Custom loggers for server activity. Defaults to standard logging. - - fast_queue (bool, optional): - Enable ZeroMQ for high-throughput (>100 RPS). Requires ZeroMQ installation. Defaults to False. - - max_batch_size, batch_timeout, stream, spec, api_path, loop: - **Deprecated**: Configure these in your LitAPI implementation instead. - - Example: - >>> # Basic - >>> server = LitServer(MyLitAPI()) - - >>> # Production - >>> server = LitServer( - ... lit_api=MyLitAPI(max_batch_size=4), - ... accelerator="cuda", - ... devices=2, - ... fast_queue=True, - ... track_requests=True - ... ) - - """ if max_batch_size is not None: warnings.warn( "'max_batch_size' and 'batch_timeout' are being deprecated in `LitServer` " @@ -924,49 +1093,169 @@ def run( pretty_logs: bool = False, **kwargs, ): - """Run the LitServe server to handle API requests and distribute them to inference workers. + """Start the LitServer to serve AI model requests with production-ready performance. + + This method launches the complete serving infrastructure: initializes worker processes, + starts the HTTP server, and begins handling requests. The server runs until manually + stopped (Ctrl+C) or programmatically shut down. + + Quick Start: + ```python + # Basic usage - starts server on localhost:8000 + server.run() + + # Production - multiple servers and custom port + server.run(port=8080, num_api_servers=4) + ``` + + Server Lifecycle: + 1. **Initialize**: Sets up worker processes and communication queues + 2. **Health Check**: Verifies all workers are ready to serve requests + 3. **Start HTTP Server**: Begins accepting requests on specified host/port + 4. **Serve Requests**: Distributes requests to workers and returns responses + 5. **Graceful Shutdown**: Properly terminates workers when stopped Args: - host (str, optional): - Host address to bind to. "0.0.0.0" for all IPs, "127.0.0.1" for localhost only. Defaults to "0.0.0.0". + host: + Network interface to bind the server to. Defaults to "0.0.0.0". + + - "0.0.0.0": Accept connections from any IP (public access) + - "127.0.0.1": Only accept local connections (localhost only) + - "::": IPv6 equivalent of "0.0.0.0" + + For development, use "127.0.0.1" for security. For production/Docker, use "0.0.0.0". + + port: + Port number to listen on. Defaults to 8000. + + - Must be between 1024-65535 (privileged ports require admin) + - Ensure the port is available and not blocked by firewalls + - Common choices: 8000, 8080, 3000, 5000 + + Performance Configuration: + num_api_servers: + Number of parallel HTTP server processes. Defaults to None (auto-detect). - port (Union[str, int], optional): - Port number to bind to. Must be available. Defaults to 8000. + - None: Uses same count as inference workers (recommended) + - Higher values improve HTTP throughput but use more memory + - Good starting point: 2-8 depending on expected load + - Each server handles HTTP requests independently - num_api_servers (Optional[int], optional): - Number of uvicorn server instances for parallel API handling. Higher values improve - throughput but use more resources. Defaults to None (single instance). + api_server_worker_type: + Process architecture for HTTP servers. Defaults to "process". 
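Deployment scripts often wait for the health check described in the server lifecycle above before routing traffic; a hedged sketch that polls the default `/health` endpoint (the `requests` dependency and the helper below are illustrative, not part of LitServe):

```python
import time

import requests


def wait_until_ready(url="http://127.0.0.1:8000/health", timeout=60.0):
    """Poll the health endpoint until it returns 200 or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass
        time.sleep(1)
    return False
```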
- log_level (str, optional): - Logging level: "critical", "error", "warning", "info", "debug", "trace". - Use "debug" for development. Defaults to "info". + - "process": Better isolation, CPU utilization, and fault tolerance + - "thread": Lower memory usage but shared memory space + - Windows automatically uses "thread" (process forking not supported) - generate_client_file (bool, optional): - Auto-generate Python client file with typed methods for API interaction. Defaults to True. + Development & Debugging: + log_level: + Logging verbosity level. Defaults to "info". - api_server_worker_type (Literal["process", "thread"], optional): - Worker type. "process" for better isolation/CPU usage, "thread" for less memory. Defaults to "process". + - "critical": Only severe errors + - "error": Error conditions + - "warning": Warning messages (good for production) + - "info": General information (default) + - "debug": Detailed debugging info (development) + - "trace": Very verbose output (troubleshooting) - pretty_logs (bool, optional): - Enhanced log formatting with colors using rich library. Good for development. Defaults to False. + pretty_logs: + Enable enhanced log formatting with colors and rich formatting. Defaults to False. + - Requires: `pip install rich` + - Great for development and local debugging + - May not display properly in some production log aggregators + + generate_client_file: + Auto-generate a Python client file for easy API interaction. Defaults to True. + + - Creates `client.py` in current directory with typed methods + - Useful for testing and integration + - Safe to disable in production environments + + Advanced Configuration: **kwargs: - Additional uvicorn server options (ssl_keyfile, ssl_certfile, etc.). See uvicorn docs. - - Example: - >>> server.run() # Basic - - >>> server.run( # Production - ... port=8080, - ... num_api_servers=4, - ... log_level="warning" - ... ) - - >>> server.run( # Development - ... log_level="debug", - ... pretty_logs=True, - ... generate_client_file=True - ... ) + Additional uvicorn server configuration options. + + Common SSL options: + ```python + server.run( + ssl_keyfile="path/to/key.pem", + ssl_certfile="path/to/cert.pem" + ) + ``` + + Other uvicorn options: ssl_ca_certs, ssl_ciphers, ssl_version, + workers, backlog, etc. See uvicorn documentation for full list. 
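Once the server is running (see the examples below), any HTTP client can call it; a hedged sketch against the default `/predict` route, where the payload shape is whatever your `decode_request` expects:

```python
import requests

# Assumes a server from the examples below is listening on localhost:8000.
response = requests.post(
    "http://127.0.0.1:8000/predict",
    json={"input": 4.0},  # payload shape is defined by your decode_request
    timeout=30,
)
response.raise_for_status()
print(response.json())
```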
+ + Examples: + Basic Development: + ```python + # Simple local development + server.run() + # Access at: http://localhost:8000 + # API docs at: http://localhost:8000/docs + ``` + + Production Configuration: + ```python + # High-performance production setup + server.run( + host="0.0.0.0", + port=8000, + num_api_servers=8, + log_level="warning", + pretty_logs=False, + generate_client_file=False + ) + ``` + + Development with Debug: + ```python + # Development with detailed logging + server.run( + host="127.0.0.1", + port=8000, + log_level="debug", + pretty_logs=True, + num_api_servers=1 + ) + ``` + + Multi-API Server: + ```python + # Balance load across multiple HTTP servers + server.run( + port=8000, + num_api_servers=4, # 4 parallel HTTP servers + api_server_worker_type="process" + ) + ``` + + Server Endpoints: + Once running, the server provides several built-in endpoints: + + - **Main API**: `POST /predict` (or custom path from LitAPI) + - **Health Check**: `GET /health` - Returns 200 when ready + - **Server Info**: `GET /info` - Shows configuration and metadata + - **API Documentation**: `GET /docs` - Interactive Swagger UI + - **OpenAPI Schema**: `GET /openapi.json` - API specification + + Stopping the Server: + - **Ctrl+C**: Graceful shutdown (recommended) + - **SIGTERM**: Graceful shutdown in Docker/Kubernetes + - **Shutdown API**: POST to `/shutdown` (if enabled) + + Common Issues: + - **Port in use**: Choose different port or stop conflicting process + - **Permission denied**: Use port > 1024 or run with appropriate permissions + - **Workers not ready**: Check model loading in LitAPI.setup() method + - **Memory issues**: Reduce num_api_servers or workers_per_device + + Notes: + - Server blocks execution until stopped (use threads for non-blocking) + - Logs show startup progress and any configuration issues + - Swagger UI provides interactive API testing interface """ if generate_client_file: