diff --git a/Dockerfile b/Dockerfile index ef9981b..03ba58d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -127,7 +127,7 @@ echo "" echo "================================================" # Start server in background -openarc serve start --host 0.0.0.0 --openarc-port 8000 & +openarc serve start --host 0.0.0.0 --port 8000 & SERVER_PID=$! # Auto-load model if specified diff --git a/README.md b/README.md index ca97c95..fa674dd 100644 --- a/README.md +++ b/README.md @@ -351,14 +351,19 @@ openarc add --model-name --model-path --engine ovge Reads added configurations from `openarc_config.json`. -Display all saved configurations: +Display all added models: ``` openarc list ``` +Display config metadata for a specific model: +``` +openarc list <model-name> --v +``` + Remove a configuration: ``` -openarc list --remove --model-name +openarc list <model-name> --rm ``` @@ -378,7 +383,7 @@ openarc serve start # defauls to 0.0.0.0:8000 Configure host and port ``` -openarc serve start --host --openarc-port +openarc serve start --host --port ``` To load models on startup: diff --git a/examples/openvino_genai/ov_genai_AutoTokenizer.py b/examples/openvino_genai/ov_genai_AutoTokenizer.py index f71e86b..e6c53f3 100644 --- a/examples/openvino_genai/ov_genai_AutoTokenizer.py +++ b/examples/openvino_genai/ov_genai_AutoTokenizer.py @@ -2,17 +2,17 @@ from openvino_genai import GenerationConfig, LLMPipeline from transformers import AutoTokenizer -model_dir = "/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen/Qwen3-REAP-25B-A3B-int4_asym-ov" +model_dir = "/mnt/Ironwolf-4TB/Models/OpenVINO/DeepSeek-V2-Lite-Chat-int4_asym-ov" pipe = LLMPipeline( model_dir, # Path to the model directory. Remember this will not pull from hub like in transformers - device="GPU.0" + device="CPU" ) tokenizer = AutoTokenizer.from_pretrained(model_dir) generation_config = GenerationConfig( - max_new_tokens=128 + max_new_tokens=24 ) prompt = "You're the fastest Llama this side of the equator. What's your favorite food? 
try to imagine" diff --git a/pyproject.toml b/pyproject.toml index ca7b88d..d730e87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ build-backend = "setuptools.build_meta" packages = ["src"] [project.scripts] -openarc = "src.cli.openarc_cli:cli" +openarc = "src.cli:cli" [tool.uv] dev-dependencies = [] diff --git a/src/cli/__init__.py b/src/cli/__init__.py index e69de29..f04d9fb 100644 --- a/src/cli/__init__.py +++ b/src/cli/__init__.py @@ -0,0 +1,6 @@ +""" +OpenArc CLI - Command-line interface for OpenArc server operations. +""" +from .main import cli + +__all__ = ['cli'] diff --git a/src/cli/groups/__init__.py b/src/cli/groups/__init__.py new file mode 100644 index 0000000..90d8197 --- /dev/null +++ b/src/cli/groups/__init__.py @@ -0,0 +1,3 @@ +""" +Command groups for OpenArc CLI. +""" diff --git a/src/cli/groups/add.py b/src/cli/groups/add.py new file mode 100644 index 0000000..c7ee458 --- /dev/null +++ b/src/cli/groups/add.py @@ -0,0 +1,102 @@ +""" +Add command - Add a model configuration to the config file. 
+""" +import json + +import click + +from ..main import cli, console +from ..utils import validate_model_path + + +@cli.command() +@click.option('--model-name', '--mn', + required=True, + help='Public facing name of the model.') +@click.option('--model-path', '--m', + required=True, + help='Path to OpenVINO IR converted model.') +@click.option('--engine', '--en', + type=click.Choice(['ovgenai', 'openvino', 'optimum']), + required=True, + help='Engine used to load the model (ovgenai, openvino, optimum)') +@click.option('--model-type', '--mt', + type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb', 'rerank']), + required=True, + help='Model type (llm, vlm, whisper, kokoro, emb, rerank)') +@click.option('--device', '--d', + required=True, + help='Device(s) to load the model on.') +@click.option("--runtime-config", "--rtc", + default=None, + help='OpenVINO runtime configuration as JSON string (e.g., \'{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}\').') +@click.option('--vlm-type', '--vt', + type=click.Choice(['internvl2', 'llava15', 'llavanext', 'minicpmv26', 'phi3vision', 'phi4mm', 'qwen2vl', 'qwen25vl', 'gemma3']), + required=False, + default=None, + help='Vision model type. 
Used to map correct vision tokens.') +@click.option('--draft-model-path', '--dmp', + required=False, + default=None, + help='Path to draft model for speculative decoding.') +@click.option('--draft-device', '--dd', + required=False, + default=None, + help='Draft model device.') +@click.option('--num-assistant-tokens', '--nat', + required=False, + default=None, + type=int, + help='Number of tokens draft model generates per step.') +@click.option('--assistant-confidence-threshold', '--act', + required=False, + default=None, + type=float, + help='Confidence threshold for accepting draft tokens.') +@click.pass_context +def add(ctx, model_path, model_name, engine, model_type, device, runtime_config, vlm_type, draft_model_path, draft_device, num_assistant_tokens, assistant_confidence_threshold): + """- Add a model configuration to the config file.""" + + # Validate model path + if not validate_model_path(model_path): + console.print(f"[red]Model file check failed! {model_path} does not contain openvino model files OR your chosen path is malformed. 
Verify chosen path is correct and acquired model files match source on the hub, or the destination of converted model.[/red]") + ctx.exit(1) + + # Parse runtime_config if provided + parsed_runtime_config = {} + if runtime_config: + try: + parsed_runtime_config = json.loads(runtime_config) + if not isinstance(parsed_runtime_config, dict): + console.print(f"[red]Error: runtime_config must be a JSON object (dictionary), got {type(parsed_runtime_config).__name__}[/red]") + console.print('[yellow]Example format: \'{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}\'[/yellow]') + ctx.exit(1) + except json.JSONDecodeError as e: + console.print(f"[red]Error parsing runtime_config JSON:[/red] {e}") + console.print('[yellow]Example format: \'{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}\'[/yellow]') + ctx.exit(1) + + # Build and save configuration + load_config = { + "model_name": model_name, + "model_path": model_path, + "model_type": model_type, + "engine": engine, + "device": device, + "runtime_config": parsed_runtime_config, + "vlm_type": vlm_type if vlm_type else None + } + + # Add speculative decoding options if provided + if draft_model_path: + load_config["draft_model_path"] = draft_model_path + if draft_device: + load_config["draft_device"] = draft_device + if num_assistant_tokens is not None: + load_config["num_assistant_tokens"] = num_assistant_tokens + if assistant_confidence_threshold is not None: + load_config["assistant_confidence_threshold"] = assistant_confidence_threshold + + ctx.obj.server_config.save_model_config(model_name, load_config) + console.print(f"[green]Model configuration saved:[/green] {model_name}") + console.print(f"[dim]Use 'openarc load {model_name}' to load this model.[/dim]") diff --git a/src/cli/groups/bench.py b/src/cli/groups/bench.py new file mode 100644 index 0000000..e9e612d --- /dev/null +++ b/src/cli/groups/bench.py @@ -0,0 +1,223 @@ +""" +Bench command - Benchmark inference with pseudo-random input tokens. 
""" +import uuid +from pathlib import Path + +import click +import requests +from rich.progress import Progress, SpinnerColumn, TextColumn +from rich.table import Table + +from ..main import cli, console +from ..utils import validate_model_path + + +@cli.command() +@click.argument('model_name') +@click.option('--input_tokens', '--p', multiple=True, default=['512'], + help='Number of prompt tokens. Can be comma-separated (e.g., --p 16,32) or specified multiple times (e.g., --p 16 --p 32). Default: 512') +@click.option('--max_tokens', '--n', multiple=True, default=['128'], + help='Number of tokens to generate. Can be comma-separated or specified multiple times. Default: 128') +@click.option('--runs', '--r', default=5, type=int, + help='Number of times to repeat each benchmark. Default: 5') +@click.option('--temperature', '--temp', default=None, type=float, + help='Sampling temperature (default: 1.0)') +@click.option('--top-k', '--k', default=None, type=int, + help='Top-k sampling (default: 50)') +@click.option('--top-p', '--p-nucleus', default=None, type=float, + help='Top-p nucleus sampling (default: 1.0)') +@click.option('--repetition-penalty', '--rep', default=None, type=float, + help='Repetition penalty (default: 1.0)') +@click.pass_context +def bench(ctx, model_name, input_tokens, max_tokens, runs, temperature, top_k, top_p, repetition_penalty): + """- Benchmark inference with pseudo-random input tokens. 
+ + Examples: + openarc bench Dolphin-X1 + openarc bench Dolphin-X1 --p 512 --n 128 --r 10 + openarc bench Dolphin-X1 --p 16,32,64 --n 128,256 + openarc bench Dolphin-X1 --p 16 --p 32 --n 128 --n 256 + """ + from ..modules.benchmark import OpenArcBenchmarks + from ..main import OpenArcCLI + + cli_instance = OpenArcCLI(server_config=ctx.obj.server_config) + + # Parse input_tokens and max_tokens (handle comma-separated and multiple invocations) + p_values = [] + for pt in input_tokens: + p_values.extend([int(x.strip()) for x in pt.split(',')]) + + n_values = [] + for nt in max_tokens: + n_values.extend([int(x.strip()) for x in nt.split(',')]) + + # Check if model exists + try: + console.print("[cyan]working...[/cyan]\n") + models_url = f"{cli_instance.base_url}/v1/models" + models_response = requests.get(models_url, headers=cli_instance.get_headers()) + + if models_response.status_code != 200: + console.print(f"[red]Failed to get model list: {models_response.status_code}[/red]") + ctx.exit(1) + + models_data = models_response.json() + available_models = [m['id'] for m in models_data.get('data', [])] + + if model_name not in available_models: + console.print(f"[red]'{model_name}' not found in loaded models[/red]") + console.print(f"[yellow]Available models: {', '.join(available_models)}[/yellow]") + console.print("[dim]Use 'openarc status' to see loaded models.[/dim]") + ctx.exit(1) + + + except requests.exceptions.RequestException as e: + console.print(f"[red]Request failed:[/red] {e}") + ctx.exit(1) + + # Get model path from config to generate input tokens + model_config = ctx.obj.server_config.get_model_config(model_name) + if not model_config: + console.print(f"[red]Model configuration not found for '{model_name}'[/red]") + console.print("[yellow]Cannot generate random tokens without model path.[/yellow]") + console.print("[blue]Use 'openarc list' to see saved configurations.[/blue]") + ctx.exit(1) + + model_path = model_config.get('model_path') + if not model_path: + 
console.print("[red]model_path not found in configuration[/red]") + ctx.exit(1) + + # Validate model path + if not validate_model_path(model_path): + console.print(f"[red]Model file check failed! {model_path} does not contain openvino model files OR your chosen path is malformed. Verify chosen path is correct and acquired model files match source on the hub, or the destination of converted model.[/red]") + ctx.exit(1) + + # Run benchmarks + console.print(f"input tokens: {p_values}") + console.print(f"max tokens: {n_values}") + console.print(f"runs: {runs}\n") + + # Generate unique run_id for this benchmark session + run_id = str(uuid.uuid4()) + + total_runs = len(p_values) * len(n_values) * runs + results = [] + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console + ) as progress: + task = progress.add_task(f"Running... (0/{total_runs})", total=total_runs) + + run_count = 0 + for p in p_values: + for n in n_values: + for r in range(runs): + run_count += 1 + progress.update(task, description=f"[dim]benching...[/dim] ({run_count}/{total_runs}) [p={p}, n={n}, r={r+1}/{runs}]") + + try: + # Generate random input tokens + input_ids = OpenArcBenchmarks.random_input_ids(model_path, p) + + # Make benchmark request + bench_url = f"{cli_instance.base_url}/openarc/bench" + payload = { + "model": model_name, + "input_ids": input_ids, + "max_tokens": n + } + + # Add optional parameters if provided + if temperature is not None: + payload["temperature"] = temperature + if top_k is not None: + payload["top_k"] = top_k + if top_p is not None: + payload["top_p"] = top_p + if repetition_penalty is not None: + payload["repetition_penalty"] = repetition_penalty + + bench_response = requests.post( + bench_url, + headers=cli_instance.get_headers(), + json=payload + ) + + if bench_response.status_code != 200: + console.print(f"\n[red]Benchmark request failed: {bench_response.status_code}[/red]") + 
console.print(f"[red]Response:[/red] {bench_response.text}") + continue + + metrics = bench_response.json().get('metrics', {}) + + # Store individual result + result = { + 'p': p, + 'n': n, + 'run': r + 1, + 'ttft': metrics.get('ttft (s)', 0), + 'tpot': metrics.get('tpot (ms)', 0), + 'prefill_throughput': metrics.get('prefill_throughput (tokens/s)', 0), + 'decode_throughput': metrics.get('decode_throughput (tokens/s)', 0), + 'decode_duration': metrics.get('decode_duration (s)', 0), + 'input_token': metrics.get('input_token', 0), + 'new_token': metrics.get('new_token', 0), + 'total_token': metrics.get('total_token', 0), + } + results.append(result) + + # Save result to database + ctx.obj.benchmark_db.save_result(model_name, result, run_id) + + except Exception as e: + console.print(f"\n[yellow]Error in run {r+1}: {e}[/yellow]") + continue + + progress.advance(task) + + # Display results + console.print("\n") + + if not results: + console.print("[red]No benchmark results collected![/red]") + ctx.exit(1) + + + + + model_path_name = Path(model_path).name + console.print(f"\n[blue]{model_path_name}[/blue]\n") + + # Create results table with visible lines + results_table = Table(show_header=True, header_style="bold") + results_table.add_column("[cyan]run[/cyan]", justify="right") + results_table.add_column("[cyan]p[/cyan]", justify="right") + results_table.add_column("[cyan]n[/cyan]", justify="right") + results_table.add_column("[cyan]ttft(s)[/cyan]", justify="right") + results_table.add_column("[cyan]tpot(ms)[/cyan]", justify="right") + results_table.add_column("[cyan]prefill(t/s)[/cyan]", justify="right") + results_table.add_column("[cyan]decode(t/s)[/cyan]", justify="right") + results_table.add_column("[cyan]duration(s)[/cyan]", justify="right") + + + for result in results: + results_table.add_row( + str(result['run']), + str(result['p']), + str(result['n']), + f"{result['ttft']:.2f}", + f"{result['tpot']:.2f}", + f"{result['prefill_throughput']:.1f}", + 
f"{result['decode_throughput']:.1f}", + f"{result['decode_duration']:.2f}" + ) + + console.print(results_table) + + + console.print(f"[dim]Total: {len(results)} runs[/dim]") diff --git a/src/cli/groups/list.py b/src/cli/groups/list.py new file mode 100644 index 0000000..66d4769 --- /dev/null +++ b/src/cli/groups/list.py @@ -0,0 +1,102 @@ +""" +List command - List saved model configurations. +""" +import click +from rich.panel import Panel +from rich.table import Table +from rich.text import Text + +from ..main import cli, console + + +@cli.command("list") +@click.argument('model_name', required=False) +@click.option('--v', 'verbose', is_flag=True, help='Show config for model_name.') +@click.option('--rm', 'remove', is_flag=True, help='Remove a model from config.') +@click.pass_context +def list(ctx, model_name, verbose, remove): + """- List saved model configurations. + + - Remove a model configuration. + + Examples: + openarc list # List all model names + openarc list model_name --v # Show metadata for specific model + openarc list model_name --rm # Remove a model configuration + """ + if remove: + if not model_name: + console.print("[red]Error:[/red] model_name is required when using --rm") + ctx.exit(1) + + # Check if model exists before trying to remove + if not ctx.obj.server_config.model_exists(model_name): + console.print(f"{model_name} [red]not found[/red]") + console.print("[yellow]Use 'openarc list' to see available configurations.[/yellow]") + ctx.exit(1) + + # Remove the configuration + if ctx.obj.server_config.remove_model_config(model_name): + console.print(f"[green]Model configuration removed:[/green] {model_name}") + else: + console.print(f"[red]Failed to remove model configuration:[/red] {model_name}") + ctx.exit(1) + return + + models = ctx.obj.server_config.get_all_models() + + if not models: + console.print("[yellow]No model configurations found.[/yellow]") + console.print("[dim]Use 'openarc add --help' to see how to save 
configurations.[/dim]") + return + + # Case 1: Show metadata for specific model with -v flag + if model_name and verbose: + if model_name not in models: + console.print(f"[red]Model not found:[/red] {model_name}") + console.print("[yellow]Use 'openarc list' to see available configurations.[/yellow]") + ctx.exit(1) + + model_config = models[model_name] + + # Create a table for the model configuration + config_table = Table(show_header=False, box=None, pad_edge=False) + + config_table.add_row("model_name", f"[cyan]{model_name}[/cyan]") + config_table.add_row("model_path", f"[yellow]{model_config.get('model_path')}[/yellow]") + config_table.add_row("device", f"[blue]{model_config.get('device')}[/blue]") + config_table.add_row("engine", f"[green]{model_config.get('engine')}[/green]") + config_table.add_row("model_type", f"[magenta]{model_config.get('model_type')}[/magenta]") + + # Display optional fields when available + if model_config.get('draft_model_path'): + config_table.add_row("draft_model_path", f"[red]{model_config.get('draft_model_path')}[/red]") + if model_config.get('draft_device'): + config_table.add_row("draft_device", f"[red]{model_config.get('draft_device')}[/red]") + if model_config.get('num_assistant_tokens') is not None: + config_table.add_row("num_assistant_tokens", f"[red]{model_config.get('num_assistant_tokens')}[/red]") + if model_config.get('assistant_confidence_threshold') is not None: + config_table.add_row("assistant_confidence_threshold", f"[red]{model_config.get('assistant_confidence_threshold')}[/red]") + + rtc = model_config.get('runtime_config', {}) + if rtc: + config_table.add_row("", "") + config_table.add_row(Text("runtime_config", style="bold underline yellow"), "") + for key, value in rtc.items(): + config_table.add_row(f" {key}", f"[dim]{value}[/dim]") + + panel = Panel( + config_table, + border_style="green" + ) + console.print(panel) + return + + # Case 2: Show only model names (default behavior) + console.print(f"[blue]Saved 
Model Configurations ({len(models)}):[/blue]\n") + + for name in models.keys(): + console.print(f" [cyan]{name}[/cyan]") + + console.print("[dim]Use 'openarc list <model_name> --v' to see model metadata.[/dim]") + console.print("[dim]Use 'openarc list <model_name> --rm' to remove a configuration.[/dim]") diff --git a/src/cli/groups/load.py b/src/cli/groups/load.py new file mode 100644 index 0000000..f2016e7 --- /dev/null +++ b/src/cli/groups/load.py @@ -0,0 +1,94 @@ +""" +Load command - Load one or more models from saved configuration. +""" +import click +import requests + +from ..main import OpenArcCLI, cli, console +from ..utils import validate_model_path + + +@cli.command() +@click.argument('model_names', nargs=-1, required=True) +@click.pass_context +def load(ctx, model_names): + """- Load one or more models from saved configuration. + + Examples: + openarc load model_name + openarc load Dolphin-X1 kokoro whisper + """ + cli_instance = OpenArcCLI(server_config=ctx.obj.server_config) + + model_names = list(model_names) + + # Track results + successful_loads = [] + failed_loads = [] + + # Start loading queue + if len(model_names) > 1: + console.print(f"[blue]Starting load queue...[/blue] ({len(model_names)} models)\n") + + # Load each model + for idx, name in enumerate(model_names, 1): + # Show progress indicator for multiple models + if len(model_names) > 1: + console.print(f"[cyan]({idx}/{len(model_names)})[/cyan] [blue]loading[/blue] {name}") + else: + console.print(f"[blue]loading[/blue] {name}") + + # Get saved configuration + saved_config = ctx.obj.server_config.get_model_config(name) + + if not saved_config: + console.print(f"[red]Model configuration not found:[/red] {name}") + console.print("[yellow]Tip: Use 'openarc list' to see saved configurations.[/yellow]\n") + failed_loads.append(name) + continue + + load_config = saved_config.copy() + + # Validate model path + model_path = load_config.get('model_path') + if model_path and not validate_model_path(model_path): + 
console.print(f"[red]Model file check failed! {model_path} does not contain openvino model files OR your chosen path is malformed. Verify chosen path is correct and acquired model files match source on the hub, or the destination of converted model.[/red]") + failed_loads.append(name) + continue + + # Make API request to load the model + url = f"{cli_instance.base_url}/openarc/load" + + try: + console.print("[cyan]...working[/cyan]") + response = requests.post(url, json=load_config, headers=cli_instance.get_headers()) + + if response.status_code == 200: + console.print(f"[green]{name} loaded![/green]\n") + successful_loads.append(name) + else: + console.print(f"[red]error: {response.status_code}[/red]") + console.print(f"[red]Response:[/red] {response.text}\n") + failed_loads.append(name) + + except requests.exceptions.RequestException as e: + console.print(f"[red]Request failed:[/red] {e}\n") + failed_loads.append(name) + + # Summary + console.print("─" * 60) + if successful_loads and not failed_loads: + console.print(f"[green]All models loaded![/green] ({len(successful_loads)}/{len(model_names)})") + elif successful_loads and failed_loads: + console.print(f"[yellow]Partial success:[/yellow] {len(successful_loads)}/{len(model_names)} models loaded") + console.print(f" [green]✓ Loaded:[/green] {', '.join(successful_loads)}") + console.print(f" [red]✗ Failed:[/red] {', '.join(failed_loads)}") + else: + console.print(f"[red]All models failed to load![/red] (0/{len(model_names)})") + console.print(f" [red]✗ Failed:[/red] {', '.join(failed_loads)}") + + console.print("[dim]Use 'openarc status' to see loaded models.[/dim]") + + # Exit with error code if any loads failed + if failed_loads: + ctx.exit(1) diff --git a/src/cli/groups/serve.py b/src/cli/groups/serve.py new file mode 100644 index 0000000..a142281 --- /dev/null +++ b/src/cli/groups/serve.py @@ -0,0 +1,72 @@ +""" +Serve command group - Start the OpenArc server. 
+""" +import os + +import click + +from ..main import cli, console + + +@cli.group() +def serve(): + """ + - Start the OpenArc server. + """ + pass + + +@serve.command("start") +@click.option("--host", type=str, default="0.0.0.0", show_default=True, + help=""" + - Host to bind the server to + """) +@click.option("--port", + type=int, + default=8000, + show_default=True, + help=""" + - Port to bind the server to + """) +@click.option("--load-models", "--lm", + required=False, + help="Load models on startup. Specify once followed by space-separated model names.") +@click.argument('startup_models', nargs=-1, required=False) +@click.pass_context +def start(ctx, host, port, load_models, startup_models): + """ + - 'start' reads --host and --port from config or defaults to 0.0.0.0:8000 + + Examples: + openarc serve start + openarc serve start --load-models model1 model2 + openarc serve start --lm Dolphin-X1 kokoro whisper + """ + from ..modules.launch_server import start_server + + # Save server configuration for other CLI commands to use + config_path = ctx.obj.server_config.save_server_config(host, port) + console.print(f"[dim]Configuration saved to: {config_path}[/dim]") + + # Handle startup models + models_to_load = [] + if load_models: + models_to_load.append(load_models) + if startup_models: + models_to_load.extend(startup_models) + + if models_to_load: + saved_model_names = ctx.obj.server_config.get_model_names() + missing = [m for m in models_to_load if m not in saved_model_names] + + if missing: + console.print("[yellow]Warning: Models not in config (will be skipped):[/yellow]") + for m in missing: + console.print(f" • {m}") + console.print("[dim]Use 'openarc list' to see saved configurations.[/dim]\n") + + os.environ["OPENARC_STARTUP_MODELS"] = ",".join(models_to_load) + console.print(f"[blue]Models to load on startup:[/blue] {', '.join(models_to_load)}\n") + + console.print(f"[green]Starting OpenArc server on {host}:{port}[/green]") + start_server(host=host, 
port=port) diff --git a/src/cli/groups/status.py b/src/cli/groups/status.py new file mode 100644 index 0000000..ba1d167 --- /dev/null +++ b/src/cli/groups/status.py @@ -0,0 +1,67 @@ +""" +Status command - GET Status of loaded models. +""" +import click +import requests +from rich.table import Table + +from ..main import OpenArcCLI, cli, console + + +@cli.command() +@click.pass_context +def status(ctx): + """- GET Status of loaded models.""" + cli_instance = OpenArcCLI(server_config=ctx.obj.server_config) + + url = f"{cli_instance.base_url}/openarc/status" + + try: + console.print("[blue]Getting model status...[/blue]") + response = requests.get(url, headers=cli_instance.get_headers()) + + if response.status_code == 200: + result = response.json() + models = result.get("models", []) + total_models = result.get("total_loaded_models", 0) + + if not models: + console.print("[yellow]No models currently loaded.[/yellow]") + else: + # Create a table for all models + status_table = Table(title=f"Loaded Models ({total_models})") + status_table.add_column("model_name", style="cyan", width=20) + status_table.add_column("device", style="blue", width=10) + status_table.add_column("model_type", style="magenta", width=15) + status_table.add_column("engine", style="green", width=10) + status_table.add_column("status", style="yellow", width=10) + status_table.add_column("time_loaded", style="dim", width=20) + + for model in models: + model_name = model.get("model_name") + device = model.get("device") + model_type = model.get("model_type") + engine = model.get("engine") + status = model.get("status") + time_loaded = model.get("time_loaded") + + status_table.add_row( + model_name, + device, + model_type, + engine, + status, + time_loaded + ) + + console.print(status_table) + console.print(f"\n[green]Total models loaded: {total_models}[/green]") + + else: + console.print(f"[red]Error getting status: {response.status_code}[/red]") + console.print(f"[red]Response:[/red] 
{response.text}") + ctx.exit(1) + + except requests.exceptions.RequestException as e: + console.print(f"[red]Request failed:[/red] {e}") + ctx.exit(1) diff --git a/src/cli/groups/tool.py b/src/cli/groups/tool.py new file mode 100644 index 0000000..dca6705 --- /dev/null +++ b/src/cli/groups/tool.py @@ -0,0 +1,86 @@ +""" +Tool command group - Utility scripts. +""" +import click +from rich.panel import Panel +from rich.table import Table + +from ..main import cli, console + + +@cli.group() +@click.pass_context +def tool(ctx): + """- Utility scripts.""" + pass + + +@tool.command('device-props') +@click.pass_context +def device_properties(ctx): + """ + - Query OpenVINO device properties for all available devices. + """ + + try: + from ..modules.device_query import DeviceDataQuery + console.print("[blue]Querying device data for all devices...[/blue]") + device_query = DeviceDataQuery() + available_devices = device_query.get_available_devices() + + console.print(f"\n[green]Available Devices ({len(available_devices)}):[/green]") + + if not available_devices: + console.print("[red]No devices found![/red]") + ctx.exit(1) + + for device in available_devices: + # Create a panel for each device + properties = device_query.get_device_properties(device) + properties_text = "\n".join([f"{key}: {value}" for key, value in properties.items()]) + + panel = Panel( + properties_text, + title=f"Device: {device}", + title_align="left", + border_style="blue" + ) + console.print(panel) + + console.print(f"\n[green]Found {len(available_devices)} device(s)[/green]") + + except Exception as e: + console.print(f"[red]Error querying device data:[/red] {e}") + ctx.exit(1) + + +@tool.command('device-detect') +@click.pass_context +def device_detect(ctx): + """ + - Detect available OpenVINO devices. 
+ """ + + try: + from ..modules.device_query import DeviceDiagnosticQuery + console.print("[blue]Detecting OpenVINO devices...[/blue]") + diagnostic = DeviceDiagnosticQuery() + available_devices = diagnostic.get_available_devices() + + table = Table() + table.add_column("Index", style="cyan", width=2) + table.add_column("Device", style="green") + + if not available_devices: + console.print("[red] Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red]") + ctx.exit(1) + + for i, device in enumerate(available_devices, 1): + table.add_row(str(i), device) + + console.print(table) + console.print(f"\n[green] Sanity test passed: found {len(available_devices)} device(s)[/green]") + + except Exception as e: + console.print(f"[red]Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red] {e}") + ctx.exit(1) diff --git a/src/cli/groups/unload.py b/src/cli/groups/unload.py new file mode 100644 index 0000000..6660fec --- /dev/null +++ b/src/cli/groups/unload.py @@ -0,0 +1,78 @@ +""" +Unload command - Unload one or more models from registry and memory. +""" +import click +import requests + +from ..main import OpenArcCLI, cli, console + + +@cli.command() +@click.argument('model_names', nargs=-1, required=True) +@click.pass_context +def unload(ctx, model_names): + """ + - Unload one or more models from registry and memory. 
+ + Examples: + openarc unload model1 + openarc unload Dolphin-X1 kokoro whisper + """ + cli_instance = OpenArcCLI(server_config=ctx.obj.server_config) + + model_names = list(model_names) + + # Track results + successful_unloads = [] + failed_unloads = [] + + # Start unloading queue + if len(model_names) > 1: + console.print(f"[blue]Starting unload queue...[/blue] ({len(model_names)} models)\n") + + # Unload each model + for idx, name in enumerate(model_names, 1): + # Show progress indicator for multiple models + if len(model_names) > 1: + console.print(f"[cyan]({idx}/{len(model_names)})[/cyan] [blue]unloading[/blue] {name}") + else: + console.print(f"[blue]unloading[/blue] {name}") + + url = f"{cli_instance.base_url}/openarc/unload" + payload = {"model_name": name} + + try: + console.print("[cyan]...working[/cyan]") + response = requests.post(url, json=payload, headers=cli_instance.get_headers()) + + if response.status_code == 200: + result = response.json() + message = result.get('message', f"Model '{name}' unloaded successfully") + console.print(f"[green]{message}[/green]\n") + successful_unloads.append(name) + else: + console.print(f"[red]error: {response.status_code}[/red]") + console.print(f"[red]Response:[/red] {response.text}\n") + failed_unloads.append(name) + + except requests.exceptions.RequestException as e: + console.print(f"[red]Request failed:[/red] {e}\n") + failed_unloads.append(name) + + # Summary + console.print("─" * 60) + if successful_unloads and not failed_unloads: + console.print(f"[green]All models unloaded![/green] ({len(successful_unloads)}/{len(model_names)})") + elif successful_unloads and failed_unloads: + console.print(f"[yellow]Partial success:[/yellow] {len(successful_unloads)}/{len(model_names)} models unloaded") + console.print(f" [green]✓ Unloaded:[/green] {', '.join(successful_unloads)}") + console.print(f" [red]✗ Failed:[/red] {', '.join(failed_unloads)}") + else: + console.print(f"[red]All models failed to unload![/red] 
(0/{len(model_names)})") + console.print(f" [red]✗ Failed:[/red] {', '.join(failed_unloads)}") + + console.print("[dim]Use 'openarc status' to see loaded models.[/dim]") + + # Exit with error code if any unloads failed + if failed_unloads: + ctx.exit(1) diff --git a/src/cli/main.py b/src/cli/main.py new file mode 100644 index 0000000..d4afc01 --- /dev/null +++ b/src/cli/main.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +""" +OpenArc CLI Tool - Main entry point and core infrastructure. +""" +import os + +import rich_click as click +from rich.console import Console +from rich.text import Text + +click.rich_click.STYLE_OPTIONS_TABLE_LEADING = 1 +click.rich_click.STYLE_OPTIONS_TABLE_BOX = "SIMPLE" +click.rich_click.STYLE_COMMANDS_TABLE_SHOW_LINES = True +click.rich_click.STYLE_COMMANDS_TABLE_BORDER_STYLE = "red" +click.rich_click.STYLE_COMMANDS_TABLE_ROW_STYLES = ["magenta", "yellow", "cyan", "green"] + +console = Console() + + +class CLIContext: + """Context object for lazy-loading heavy dependencies.""" + __slots__ = ('_server_config', '_benchmark_db') + + def __init__(self): + self._server_config = None + self._benchmark_db = None + + @property + def server_config(self): + """Lazy-load ServerConfig only when needed.""" + if self._server_config is None: + from .modules.server_config import ServerConfig + self._server_config = ServerConfig() + return self._server_config + + @property + def benchmark_db(self): + """Lazy-load BenchmarkDB only when needed.""" + if self._benchmark_db is None: + from .modules.benchmark import BenchmarkDB + self._benchmark_db = BenchmarkDB() + return self._benchmark_db + + +class OpenArcCLI: + def __init__(self, base_url=None, api_key=None, server_config=None): + if base_url is None and server_config is not None: + base_url = server_config.get_base_url() + self.base_url = base_url + self.api_key = api_key or os.getenv('OPENARC_API_KEY') + + def get_headers(self): + """Get headers for API requests.""" + headers = {'Content-Type': 
'application/json'} + if self.api_key: + headers['Authorization'] = f'Bearer {self.api_key}' + return headers + + +class ColoredAsciiArtGroup(click.RichGroup): + """Custom Click group with cached ASCII art banner for performance.""" + + # Cache ASCII art as class attribute (built once, reused forever) + _ascii_art_cache = None + + @classmethod + def _build_ascii_art(cls) -> Text: + """Build and cache the ASCII art banner.""" + if cls._ascii_art_cache is None: + # Build entire ASCII art in one go for better performance + ascii_lines = [ + (" _____ ___ \n", "blue"), + ("| _ | / _ \\ \n", "blue"), + ("| | | |_ __ ___ _ __ / /_\\ \\_ __ ___ \n", "blue"), + ("| | | | '_ \\ / _ \\ '_ \\| _ | '__/ __|\n", "blue"), + ("\\ \\_/ / |_) | __/ | | | | | | | | (__ \n", "blue"), + (" \\___/| .__/ \\___|_| |_\\_| |_/_| \\___|\n", "blue"), + (" | | \n", "blue"), + (" |_| \n", "blue"), + (" \n", "white"), + (" You know who ELSE uses OpenArc?\n", "white"), + ] + + art = Text() + for line, style in ascii_lines: + art.append(line, style=style) + + cls._ascii_art_cache = art + + return cls._ascii_art_cache + + def get_help(self, ctx): + """Display help with pre-cached ASCII art banner.""" + console.print(self._build_ascii_art()) + return super().get_help(ctx) + + +@click.group(cls=ColoredAsciiArtGroup) +@click.pass_context +def cli(ctx): + """ + Use this application to interface with the OpenArc server. + + Features: + + • Start the OpenArc server. + + • Load models into the OpenArc server. + + • List models from saved configurations. + + • Check the status of loaded models. + + • Unload models. + + • Benchmark model performance. + + • Query device properties. + + • Query installed devices. + + + To get started add --help to one of the commands below to view its documentation. 
+ """ + ctx.ensure_object(CLIContext) + + +# Import command groups to register them with the CLI +from .groups import add, bench, list, load, serve, status, tool, unload # noqa: E402, F401 + + +if __name__ == "__main__": + cli() diff --git a/src/cli/modules/__init__.py b/src/cli/modules/__init__.py new file mode 100644 index 0000000..44bcb88 --- /dev/null +++ b/src/cli/modules/__init__.py @@ -0,0 +1,3 @@ +""" +Support modules for OpenArc CLI. +""" diff --git a/src/cli/benchmark.py b/src/cli/modules/benchmark.py similarity index 98% rename from src/cli/benchmark.py rename to src/cli/modules/benchmark.py index 3b899a1..566626d 100644 --- a/src/cli/benchmark.py +++ b/src/cli/modules/benchmark.py @@ -18,7 +18,7 @@ def __init__(self, db_file: Optional[Path] = None): db_file: Path to the database file. If None, defaults to openarc_bench.db in project root. """ if db_file is None: - project_root = Path(__file__).parent.parent.parent + project_root = Path(__file__).parent.parent.parent.parent db_file = project_root / "openarc_bench.db" self.db_file = Path(db_file) diff --git a/src/cli/device_query.py b/src/cli/modules/device_query.py similarity index 100% rename from src/cli/device_query.py rename to src/cli/modules/device_query.py diff --git a/src/cli/launch_server.py b/src/cli/modules/launch_server.py similarity index 92% rename from src/cli/launch_server.py rename to src/cli/modules/launch_server.py index 5227021..246af27 100644 --- a/src/cli/launch_server.py +++ b/src/cli/modules/launch_server.py @@ -3,7 +3,7 @@ from pathlib import Path # Configure logging -log_file = Path(__file__).parent.parent.parent / "openarc.log" +log_file = Path(__file__).parent.parent.parent.parent / "openarc.log" # Create a custom logging configuration for uvicorn LOG_CONFIG = { @@ -74,7 +74,7 @@ logger = logging.getLogger("OpenArc") -def start_server(host: str = "0.0.0.0", openarc_port: int = 8001, reload: bool = False): +def start_server(host: str = "0.0.0.0", port: int = 8001, reload: bool 
= False): """ Launches the OpenArc API server @@ -82,7 +82,7 @@ def start_server(host: str = "0.0.0.0", openarc_port: int = 8001, reload: bool = host: Host to bind the server to port: Port to bind the server to """ - logger.info(f"Launching {host}:{openarc_port}") + logger.info(f"Launching {host}:{port}") logger.info("--------------------------------") logger.info("OpenArc endpoints:") logger.info(" - POST /openarc/load Load a model") @@ -101,7 +101,7 @@ def start_server(host: str = "0.0.0.0", openarc_port: int = 8001, reload: bool = uvicorn.run( "src.server.main:app", host=host, - port=openarc_port, + port=port, log_config=LOG_CONFIG, reload=reload ) \ No newline at end of file diff --git a/src/cli/server_config.py b/src/cli/modules/server_config.py similarity index 98% rename from src/cli/server_config.py rename to src/cli/modules/server_config.py index 856bf31..62dec8d 100644 --- a/src/cli/server_config.py +++ b/src/cli/modules/server_config.py @@ -19,7 +19,7 @@ def __init__(self, config_file: Optional[Path] = None): config_file: Path to the config file. If None, defaults to openarc_config.json in project root. """ if config_file is None: - project_root = Path(__file__).parent.parent.parent + project_root = Path(__file__).parent.parent.parent.parent config_file = project_root / "openarc_config.json" self.config_file = Path(config_file) diff --git a/src/cli/openarc_cli.py b/src/cli/openarc_cli.py deleted file mode 100644 index d0581d0..0000000 --- a/src/cli/openarc_cli.py +++ /dev/null @@ -1,898 +0,0 @@ -#!/usr/bin/env python3 -""" -OpenArc CLI Tool - Command-line interface for OpenArc server operations. 
-""" -import json -import os -import uuid -from pathlib import Path - -import requests -import rich_click as click -from rich.console import Console -from rich.panel import Panel -from rich.progress import Progress, SpinnerColumn, TextColumn -from rich.table import Table -from rich.text import Text - -click.rich_click.STYLE_OPTIONS_TABLE_LEADING = 1 -click.rich_click.STYLE_OPTIONS_TABLE_BOX = "SIMPLE" -click.rich_click.STYLE_COMMANDS_TABLE_SHOW_LINES = True -click.rich_click.STYLE_COMMANDS_TABLE_BORDER_STYLE = "red" -click.rich_click.STYLE_COMMANDS_TABLE_ROW_STYLES = ["magenta", "yellow", "cyan", "green"] - -console = Console() - -def validate_model_path(model_path): - """ - Validate that model_path contains OpenVINO model files. - Checks for at least one file with "_model.bin" and one file with "_model.xml" in filename. - Returns True if valid, False otherwise. - """ - path = Path(model_path) - - # Resolve the path - if not path.exists(): - return False - - # Determine search directory - if path is a file, use its parent; if directory, use it - if path.is_file(): - search_dir = path.parent - else: - search_dir = path - - # Check for required files - has_bin = False - has_xml = False - - try: - for file_path in search_dir.rglob("*"): - if file_path.is_file(): - filename = file_path.name - if "_model.bin" in filename: - has_bin = True - if "_model.xml" in filename: - has_xml = True - if has_bin and has_xml: - return True - except (OSError, PermissionError): - return False - - return False - -class CLIContext: - """Context object for lazy-loading heavy dependencies.""" - __slots__ = ('_server_config', '_benchmark_db') - - def __init__(self): - self._server_config = None - self._benchmark_db = None - - @property - def server_config(self): - """Lazy-load ServerConfig only when needed.""" - if self._server_config is None: - from .server_config import ServerConfig - self._server_config = ServerConfig() - return self._server_config - - @property - def benchmark_db(self): 
- """Lazy-load BenchmarkDB only when needed.""" - if self._benchmark_db is None: - from .benchmark import BenchmarkDB - self._benchmark_db = BenchmarkDB() - return self._benchmark_db - - -class OpenArcCLI: - def __init__(self, base_url=None, api_key=None, server_config=None): - if base_url is None and server_config is not None: - base_url = server_config.get_base_url() - self.base_url = base_url - self.api_key = api_key or os.getenv('OPENARC_API_KEY') - - def get_headers(self): - """Get headers for API requests.""" - headers = {'Content-Type': 'application/json'} - if self.api_key: - headers['Authorization'] = f'Bearer {self.api_key}' - return headers - -class ColoredAsciiArtGroup(click.RichGroup): - """Custom Click group with cached ASCII art banner for performance.""" - - # Cache ASCII art as class attribute (built once, reused forever) - _ascii_art_cache = None - - @classmethod - def _build_ascii_art(cls) -> Text: - """Build and cache the ASCII art banner.""" - if cls._ascii_art_cache is None: - # Build entire ASCII art in one go for better performance - ascii_lines = [ - (" _____ ___ \n", "blue"), - ("| _ | / _ \\ \n", "blue"), - ("| | | |_ __ ___ _ __ / /_\\ \\_ __ ___ \n", "blue"), - ("| | | | '_ \\ / _ \\ '_ \\| _ | '__/ __|\n", "blue"), - ("\\ \\_/ / |_) | __/ | | | | | | | | (__ \n", "blue"), - (" \\___/| .__/ \\___|_| |_\\_| |_/_| \\___|\n", "blue"), - (" | | \n", "blue"), - (" |_| \n", "blue"), - (" \n", "white"), - (" Making AI go brr since 2025 \n", "white"), - ] - - art = Text() - for line, style in ascii_lines: - art.append(line, style=style) - - cls._ascii_art_cache = art - - return cls._ascii_art_cache - - def get_help(self, ctx): - """Display help with pre-cached ASCII art banner.""" - console.print(self._build_ascii_art()) - return super().get_help(ctx) - -@click.group(cls=ColoredAsciiArtGroup) -@click.pass_context -def cli(ctx): - """ - Use this application to interface with the OpenArc server. - - Features: - - • Start the OpenArc server. 
- - • Load models into the OpenArc server. - - • List models from saved configurations. - - • Check the status of loaded models. - - • Unload models. - - • Benchmark model performance. - - • Query device properties. - - • Query installed devices. - - - To get started add --help to one of the commands below to view its documentation. - """ - ctx.ensure_object(CLIContext) - -@cli.command() -@click.option('--model-name', '--mn', - required=True, - help='Public facing name of the model.') -@click.option('--model-path', '--m', - required=True, - help='Path to OpenVINO IR converted model.') -@click.option('--engine', '--en', - type=click.Choice(['ovgenai', 'openvino', 'optimum']), - required=True, - help='Engine used to load the model (ovgenai, openvino, optimum)') -@click.option('--model-type', '--mt', - type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb', 'rerank']), - required=True, - help='Model type (llm, vlm, whisper, kokoro, emb, rerank)') -@click.option('--device', '--d', - required=True, - help='Device(s) to load the model on.') -@click.option("--runtime-config", "--rtc", - default=None, - help='OpenVINO runtime configuration as JSON string (e.g., \'{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}\').') -@click.option('--vlm-type', '--vt', - type=click.Choice(['internvl2', 'llava15', 'llavanext', 'minicpmv26', 'phi3vision', 'phi4mm', 'qwen2vl', 'qwen25vl', 'gemma3']), - required=False, - default=None, - help='Vision model type. 
Used to map correct vision tokens.') -@click.option('--draft-model-path', '--dmp', - required=False, - default=None, - help='Path to draft model for speculative decoding.') -@click.option('--draft-device', '--dd', - required=False, - default=None, - help='Device for draft model (e.g., CPU, GPU).') -@click.option('--num-assistant-tokens', '--nat', - required=False, - default=None, - type=int, - help='Number of tokens draft model generates per step (typically 2-5).') -@click.option('--assistant-confidence-threshold', '--act', - required=False, - default=None, - type=float, - help='Confidence threshold for accepting draft tokens (typically 0.3-0.5).') -@click.pass_context -def add(ctx, model_path, model_name, engine, model_type, device, runtime_config, vlm_type, draft_model_path, draft_device, num_assistant_tokens, assistant_confidence_threshold): - """- Add a model configuration to the config file.""" - - # Validate model path - if not validate_model_path(model_path): - console.print(f"[red]Model file check failed! {model_path} does not contain openvino model files OR your chosen path is malformed. 
Verify chosen path is correct and acquired model files match source on the hub, or the destination of converted model.[/red]") - ctx.exit(1) - - # Parse runtime_config if provided - parsed_runtime_config = {} - if runtime_config: - try: - parsed_runtime_config = json.loads(runtime_config) - if not isinstance(parsed_runtime_config, dict): - console.print(f"[red]Error: runtime_config must be a JSON object (dictionary), got {type(parsed_runtime_config).__name__}[/red]") - console.print('[yellow]Example format: \'{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}\'[/yellow]') - ctx.exit(1) - except json.JSONDecodeError as e: - console.print(f"[red]Error parsing runtime_config JSON:[/red] {e}") - console.print('[yellow]Example format: \'{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}\'[/yellow]') - ctx.exit(1) - - # Build and save configuration - load_config = { - "model_name": model_name, - "model_path": model_path, - "model_type": model_type, - "engine": engine, - "device": device, - "runtime_config": parsed_runtime_config, - "vlm_type": vlm_type if vlm_type else None - } - - # Add speculative decoding options if provided - if draft_model_path: - load_config["draft_model_path"] = draft_model_path - if draft_device: - load_config["draft_device"] = draft_device - if num_assistant_tokens is not None: - load_config["num_assistant_tokens"] = num_assistant_tokens - if assistant_confidence_threshold is not None: - load_config["assistant_confidence_threshold"] = assistant_confidence_threshold - - ctx.obj.server_config.save_model_config(model_name, load_config) - console.print(f"[green]Model configuration saved:[/green] {model_name}") - console.print(f"[dim]Use 'openarc load {model_name}' to load this model.[/dim]") - -@cli.command() -@click.argument('model_names', nargs=-1, required=True) -@click.pass_context -def load(ctx, model_names): - """- Load one or more models from saved configuration. 
- - Examples: - openarc load model1 - openarc load Dolphin-X1 kokoro whisper - """ - cli_instance = OpenArcCLI(server_config=ctx.obj.server_config) - - model_names = list(model_names) - - # Track results - successful_loads = [] - failed_loads = [] - - # Start loading queue - if len(model_names) > 1: - console.print(f"[blue]Starting load queue...[/blue] ({len(model_names)} models)\n") - - # Load each model - for idx, name in enumerate(model_names, 1): - # Show progress indicator for multiple models - if len(model_names) > 1: - console.print(f"[cyan]({idx}/{len(model_names)})[/cyan] [blue]loading[/blue] {name}") - else: - console.print(f"[blue]loading[/blue] {name}") - - # Get saved configuration - saved_config = ctx.obj.server_config.get_model_config(name) - - if not saved_config: - console.print(f"[red]Model configuration not found:[/red] {name}") - console.print("[yellow]Tip: Use 'openarc list' to see saved configurations.[/yellow]\n") - failed_loads.append(name) - continue - - load_config = saved_config.copy() - - # Validate model path - model_path = load_config.get('model_path') - if model_path and not validate_model_path(model_path): - console.print(f"[red]Model file check failed! {model_path} does not contain openvino model files OR your chosen path is malformed. 
Verify chosen path is correct and acquired model files match source on the hub, or the destination of converted model.[/red]") - failed_loads.append(name) - continue - - # Make API request to load the model - url = f"{cli_instance.base_url}/openarc/load" - - try: - console.print("[cyan]...working[/cyan]") - response = requests.post(url, json=load_config, headers=cli_instance.get_headers()) - - if response.status_code == 200: - console.print(f"[green]{name} loaded![/green]\n") - successful_loads.append(name) - else: - console.print(f"[red]error: {response.status_code}[/red]") - console.print(f"[red]Response:[/red] {response.text}\n") - failed_loads.append(name) - - except requests.exceptions.RequestException as e: - console.print(f"[red]Request failed:[/red] {e}\n") - failed_loads.append(name) - - # Summary - console.print("─" * 60) - if successful_loads and not failed_loads: - console.print(f"[green]All models loaded![/green] ({len(successful_loads)}/{len(model_names)})") - elif successful_loads and failed_loads: - console.print(f"[yellow]Partial success:[/yellow] {len(successful_loads)}/{len(model_names)} models loaded") - console.print(f" [green]✓ Loaded:[/green] {', '.join(successful_loads)}") - console.print(f" [red]✗ Failed:[/red] {', '.join(failed_loads)}") - else: - console.print(f"[red]All models failed to load![/red] (0/{len(model_names)})") - console.print(f" [red]✗ Failed:[/red] {', '.join(failed_loads)}") - - console.print("[dim]Use 'openarc status' to see loaded models.[/dim]") - - # Exit with error code if any loads failed - if failed_loads: - ctx.exit(1) - -@cli.command() -@click.argument('model_names', nargs=-1, required=True) -@click.pass_context -def unload(ctx, model_names): - """ - - Unload one or more models from registry and memory. 
- - Examples: - openarc unload model1 - openarc unload Dolphin-X1 kokoro whisper - """ - cli_instance = OpenArcCLI(server_config=ctx.obj.server_config) - - model_names = list(model_names) - - # Track results - successful_unloads = [] - failed_unloads = [] - - # Start unloading queue - if len(model_names) > 1: - console.print(f"[blue]Starting unload queue...[/blue] ({len(model_names)} models)\n") - - # Unload each model - for idx, name in enumerate(model_names, 1): - # Show progress indicator for multiple models - if len(model_names) > 1: - console.print(f"[cyan]({idx}/{len(model_names)})[/cyan] [blue]unloading[/blue] {name}") - else: - console.print(f"[blue]unloading[/blue] {name}") - - url = f"{cli_instance.base_url}/openarc/unload" - payload = {"model_name": name} - - try: - console.print("[cyan]...working[/cyan]") - response = requests.post(url, json=payload, headers=cli_instance.get_headers()) - - if response.status_code == 200: - result = response.json() - message = result.get('message', f"Model '{name}' unloaded successfully") - console.print(f"[green]{message}[/green]\n") - successful_unloads.append(name) - else: - console.print(f"[red]error: {response.status_code}[/red]") - console.print(f"[red]Response:[/red] {response.text}\n") - failed_unloads.append(name) - - except requests.exceptions.RequestException as e: - console.print(f"[red]Request failed:[/red] {e}\n") - failed_unloads.append(name) - - # Summary - console.print("─" * 60) - if successful_unloads and not failed_unloads: - console.print(f"[green]All models unloaded![/green] ({len(successful_unloads)}/{len(model_names)})") - elif successful_unloads and failed_unloads: - console.print(f"[yellow]Partial success:[/yellow] {len(successful_unloads)}/{len(model_names)} models unloaded") - console.print(f" [green]✓ Unloaded:[/green] {', '.join(successful_unloads)}") - console.print(f" [red]✗ Failed:[/red] {', '.join(failed_unloads)}") - else: - console.print(f"[red]All models failed to unload![/red] 
(0/{len(model_names)})") - console.print(f" [red]✗ Failed:[/red] {', '.join(failed_unloads)}") - - console.print("[dim]Use 'openarc status' to see loaded models.[/dim]") - - # Exit with error code if any unloads failed - if failed_unloads: - ctx.exit(1) - -@cli.command("list") -@click.option('--model-name','--mn', help='Model name to remove (used with --remove/--rm).') -@click.option('--remove', '--rm', is_flag=True, help='Remove a model configuration.') -@click.pass_context -def list_configs(ctx, remove, model_name): - """- List saved model configurations. - - - Remove a model configuration.""" - if remove: - if not model_name: - console.print("[red]Error:[/red] --model-name is required when using --remove") - - ctx.exit(1) - - # Check if model exists before trying to remove - if not ctx.obj.server_config.model_exists(model_name): - console.print(f"{model_name}[red] not found:[/red]") - console.print("[yellow]Use 'openarc list' to see available configurations.[/yellow]") - ctx.exit(1) - - # Remove the configuration - if ctx.obj.server_config.remove_model_config(model_name): - console.print(f"[green]Model configuration removed:[/green] {model_name}") - else: - console.print(f"[red]Failed to remove model configuration:[/red] {model_name}") - ctx.exit(1) - return - - models = ctx.obj.server_config.get_all_models() - - if not models: - console.print("[yellow]No model configurations found.[/yellow]") - console.print("[dim]Use 'openarc add --help' to see how to save configurations.[/dim]") - return - - console.print(f"[blue]Saved Model Configurations ({len(models)}):[/blue]\n") - - for model_name, model_config in models.items(): - # Create a table for each model configuration - config_table = Table(show_header=False, box=None, pad_edge=False) - - - config_table.add_row("model_name", f"[cyan]{model_name}[/cyan]") - config_table.add_row("model_path", f"[yellow]{model_config.get('model_path')}[/yellow]") - config_table.add_row("device", 
f"[blue]{model_config.get('device')}[/blue]") - config_table.add_row("engine", f"[green]{model_config.get('engine')}[/green]") - config_table.add_row("model_type", f"[magenta]{model_config.get('model_type')}[/magenta]") - - # Display optional fields when available - if model_config.get('draft_model_path'): - config_table.add_row("draft_model_path", f"[red]{model_config.get('draft_model_path')}[/red]") - if model_config.get('draft_device'): - config_table.add_row("draft_device", f"[red]{model_config.get('draft_device')}[/red]") - if model_config.get('num_assistant_tokens') is not None: - config_table.add_row("num_assistant_tokens", f"[red]{model_config.get('num_assistant_tokens')}[/red]") - if model_config.get('assistant_confidence_threshold') is not None: - config_table.add_row("assistant_confidence_threshold", f"[red]{model_config.get('assistant_confidence_threshold')}[/red]") - - - rtc = model_config.get('runtime_config', {}) - if rtc: - config_table.add_row("", "") - config_table.add_row(Text("runtime_config", style="bold underline yellow"), "") - for key, value in rtc.items(): - config_table.add_row(f" {key}", f"[dim]{value}[/dim]") - - panel = Panel( - config_table, - border_style="green" - ) - console.print(panel) - - console.print("\n[dim]To load saved configurations: openarc load [model_name2 ...][/dim]") - console.print("[dim]To remove a configuration: openarc list --remove --model-name [/dim]") - -@cli.command() -@click.pass_context -def status(ctx): - """- GET Status of loaded models.""" - cli_instance = OpenArcCLI(server_config=ctx.obj.server_config) - - url = f"{cli_instance.base_url}/openarc/status" - - try: - console.print("[blue]Getting model status...[/blue]") - response = requests.get(url, headers=cli_instance.get_headers()) - - if response.status_code == 200: - result = response.json() - models = result.get("models", []) - total_models = result.get("total_loaded_models", 0) - - if not models: - console.print("[yellow]No models currently 
loaded.[/yellow]") - else: - # Create a table for all models - status_table = Table(title=f"Loaded Models ({total_models})") - status_table.add_column("model_name", style="cyan", width=20) - status_table.add_column("device", style="blue", width=10) - status_table.add_column("model_type", style="magenta", width=15) - status_table.add_column("engine", style="green", width=10) - status_table.add_column("status", style="yellow", width=10) - status_table.add_column("time_loaded", style="dim", width=20) - - for model in models: - model_name = model.get("model_name") - device = model.get("device") - model_type = model.get("model_type") - engine = model.get("engine") - status = model.get("status") - time_loaded = model.get("time_loaded") - - status_table.add_row( - model_name, - device, - model_type, - engine, - status, - time_loaded - ) - - console.print(status_table) - console.print(f"\n[green]Total models loaded: {total_models}[/green]") - - else: - console.print(f"[red]Error getting status: {response.status_code}[/red]") - console.print(f"[red]Response:[/red] {response.text}") - ctx.exit(1) - - except requests.exceptions.RequestException as e: - console.print(f"[red]Request failed:[/red] {e}") - ctx.exit(1) - -@cli.command() -@click.argument('model_name') -@click.option('--input_tokens', '--p', multiple=True, default=['512'], - help='Number of prompt tokens. Can be comma-separated (e.g., --p 16,32) or specified multiple times (e.g., -p 16 -p 32). Default: 512') -@click.option('--max_tokens', '--n', multiple=True, default=['128'], - help='Number of tokens to generate. Can be comma-separated or specified multiple times. Default: 128') -@click.option('--runs', '--r', default=5, type=int, - help='Number of times to repeat each benchmark. 
Default: 5') -@click.option('--temperature', '--temp', default=None, type=float, - help='Sampling temperature (default: 1.0)') -@click.option('--top-k', '--k', default=None, type=int, - help='Top-k sampling (default: 50)') -@click.option('--top-p', '--p-nucleus', default=None, type=float, - help='Top-p nucleus sampling (default: 1.0)') -@click.option('--repetition-penalty', '--rep', default=None, type=float, - help='Repetition penalty (default: 1.0)') -@click.pass_context -def bench(ctx, model_name, input_tokens, max_tokens, runs, temperature, top_k, top_p, repetition_penalty): - """- Benchmark inference with pseudo-random input tokens. - - Examples: - openarc bench Dolphin-X1 - openarc bench Dolphin-X1 --p 512 --n 128 -r 10 - openarc bench Dolphin-X1 --p 16,32,64 --n 128,256 - openarc bench Dolphin-X1 -p 16 -p 32 -n 128 -n 256 - """ - cli_instance = OpenArcCLI(server_config=ctx.obj.server_config) - - # Parse input_tokens and max_tokens (handle comma-separated and multiple invocations) - p_values = [] - for pt in input_tokens: - p_values.extend([int(x.strip()) for x in pt.split(',')]) - - n_values = [] - for nt in max_tokens: - n_values.extend([int(x.strip()) for x in nt.split(',')]) - - # Check if model exists - try: - console.print("[cyan]working...[/cyan]\n") - models_url = f"{cli_instance.base_url}/v1/models" - models_response = requests.get(models_url, headers=cli_instance.get_headers()) - - if models_response.status_code != 200: - console.print(f"[red]Failed to get model list: {models_response.status_code}[/red]") - ctx.exit(1) - - models_data = models_response.json() - available_models = [m['id'] for m in models_data.get('data', [])] - - if model_name not in available_models: - console.print(f"[red]'{model_name}' not found in loaded models[/red]") - console.print(f"[yellow]Available models: {', '.join(available_models)}[/yellow]") - console.print("[dim]Use 'openarc status' to see loaded models.[/dim]") - ctx.exit(1) - - - except 
requests.exceptions.RequestException as e: - console.print(f"[red]Request failed:[/red] {e}") - ctx.exit(1) - - # Get model path from config to generate input tokens - model_config = ctx.obj.server_config.get_model_config(model_name) - if not model_config: - console.print(f"[red]Model configuration not found for '{model_name}'[/red]") - console.print("[yellow]Cannot generate random tokens without model path.[/yellow]") - console.print("[blue]Use 'openarc list' to see saved configurations.[/blue]") - ctx.exit(1) - - model_path = model_config.get('model_path') - if not model_path: - console.print("[red]model_path not found in configuration[/red]") - ctx.exit(1) - - # Validate model path - if not validate_model_path(model_path): - console.print(f"[red]Model file check failed! {model_path} does not contain openvino model files OR your chosen path is malformed. Verify chosen path is correct and acquired model files match source on the hub, or the destination of converted model.[/red]") - ctx.exit(1) - - # Run benchmarks - console.print(f"input tokens: {p_values}") - console.print(f"max tokens: {n_values}") - console.print(f"runs: {runs}\n") - - # Lazy load OpenArcBenchmarks - from .benchmark import OpenArcBenchmarks - - # Generate unique run_id for this benchmark session - run_id = str(uuid.uuid4()) - - total_runs = len(p_values) * len(n_values) * runs - results = [] - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - console=console - ) as progress: - task = progress.add_task(f"Running... 
(0/{total_runs})", total=total_runs) - - run_count = 0 - for p in p_values: - for n in n_values: - for r in range(runs): - run_count += 1 - progress.update(task, description=f"[dim]benching...[/dim] ({run_count}/{total_runs}) [p={p}, n={n}, r={r+1}/{runs}]") - - try: - # Generate random input tokens - input_ids = OpenArcBenchmarks.random_input_ids(model_path, p) - - # Make benchmark request - bench_url = f"{cli_instance.base_url}/openarc/bench" - payload = { - "model": model_name, - "input_ids": input_ids, - "max_tokens": n - } - - # Add optional parameters if provided - if temperature is not None: - payload["temperature"] = temperature - if top_k is not None: - payload["top_k"] = top_k - if top_p is not None: - payload["top_p"] = top_p - if repetition_penalty is not None: - payload["repetition_penalty"] = repetition_penalty - - bench_response = requests.post( - bench_url, - headers=cli_instance.get_headers(), - json=payload - ) - - if bench_response.status_code != 200: - console.print(f"\n[red]Benchmark request failed: {bench_response.status_code}[/red]") - console.print(f"[red]Response:[/red] {bench_response.text}") - continue - - metrics = bench_response.json().get('metrics', {}) - - # Store individual result - result = { - 'p': p, - 'n': n, - 'run': r + 1, - 'ttft': metrics.get('ttft (s)', 0), - 'tpot': metrics.get('tpot (ms)', 0), - 'prefill_throughput': metrics.get('prefill_throughput (tokens/s)', 0), - 'decode_throughput': metrics.get('decode_throughput (tokens/s)', 0), - 'decode_duration': metrics.get('decode_duration (s)', 0), - 'input_token': metrics.get('input_token', 0), - 'new_token': metrics.get('new_token', 0), - 'total_token': metrics.get('total_token', 0), - } - results.append(result) - - # Save result to database - ctx.obj.benchmark_db.save_result(model_name, result, run_id) - - except Exception as e: - console.print(f"\n[yellow]Error in run {r+1}: {e}[/yellow]") - continue - - progress.advance(task) - - # Display results - console.print("\n") - - 
if not results: - console.print("[red]No benchmark results collected![/red]") - ctx.exit(1) - - - - - model_path_name = Path(model_path).name - console.print(f"\n[blue]{model_path_name}[/blue]\n") - - # Create results table with visible lines - results_table = Table(show_header=True, header_style="bold") - results_table.add_column("[cyan]run[/cyan]", justify="right") - results_table.add_column("[cyan]p[/cyan]", justify="right") - results_table.add_column("[cyan]n[/cyan]", justify="right") - results_table.add_column("[cyan]ttft(s)[/cyan]", justify="right") - results_table.add_column("[cyan]tpot(ms)[/cyan]", justify="right") - results_table.add_column("[cyan]prefill(t/s)[/cyan]", justify="right") - results_table.add_column("[cyan]decode(t/s)[/cyan]", justify="right") - results_table.add_column("[cyan]duration(s)[/cyan]", justify="right") - - - for result in results: - results_table.add_row( - str(result['run']), - str(result['p']), - str(result['n']), - f"{result['ttft']:.2f}", - f"{result['tpot']:.2f}", - f"{result['prefill_throughput']:.1f}", - f"{result['decode_throughput']:.1f}", - f"{result['decode_duration']:.2f}" - ) - - console.print(results_table) - - - console.print(f"[dim]Total: {len(results)} runs[/dim]") - -@cli.group() -@click.pass_context -def tool(ctx): - """- Utility scripts.""" - pass - -@tool.command('device-props') -@click.pass_context -def device_properties(ctx): - """ - - Query OpenVINO device properties for all available devices. 
- """ - - try: - from .device_query import DeviceDataQuery - console.print("[blue]Querying device data for all devices...[/blue]") - device_query = DeviceDataQuery() - available_devices = device_query.get_available_devices() - - console.print(f"\n[green]Available Devices ({len(available_devices)}):[/green]") - - if not available_devices: - console.print("[red]No devices found![/red]") - ctx.exit(1) - - for device in available_devices: - # Create a panel for each device - properties = device_query.get_device_properties(device) - properties_text = "\n".join([f"{key}: {value}" for key, value in properties.items()]) - - panel = Panel( - properties_text, - title=f"Device: {device}", - title_align="left", - border_style="blue" - ) - console.print(panel) - - console.print(f"\n[green]Found {len(available_devices)} device(s)[/green]") - - except Exception as e: - console.print(f"[red]Error querying device data:[/red] {e}") - ctx.exit(1) - -@tool.command('device-detect') -@click.pass_context -def device_detect(ctx): - """ - - Detect available OpenVINO devices. - """ - - try: - from .device_query import DeviceDiagnosticQuery - console.print("[blue]Detecting OpenVINO devices...[/blue]") - diagnostic = DeviceDiagnosticQuery() - available_devices = diagnostic.get_available_devices() - - table = Table() - table.add_column("Index", style="cyan", width=2) - table.add_column("Device", style="green") - - if not available_devices: - console.print("[red] Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red]") - ctx.exit(1) - - for i, device in enumerate(available_devices, 1): - table.add_row(str(i), device) - - console.print(table) - console.print(f"\n[green] Sanity test passed: found {len(available_devices)} device(s)[/green]") - - except Exception as e: - console.print(f"[red]Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red] {e}") - ctx.exit(1) - -@cli.group() -def serve(): - """ - - Start the OpenArc server. 
- """ - pass - -@serve.command("start") -@click.option("--host", type=str, default="0.0.0.0", show_default=True, - help=""" - - Host to bind the server to - """) -@click.option("--openarc-port", - type=int, - default=8000, - show_default=True, - help=""" - - Port to bind the server to - """) -@click.option("--load-models", "--lm", - required=False, - help="Load models on startup. Specify once followed by space-separated model names.") -@click.argument('startup_models', nargs=-1, required=False) -@click.pass_context -def start(ctx, host, openarc_port, load_models, startup_models): - """ - - 'start' reads --host and --openarc-port from config or defaults to 0.0.0.0:8000 - - Examples: - openarc serve start - openarc serve start --load-models model1 model2 - openarc serve start --lm Dolphin-X1 kokoro whisper - """ - # Save server configuration for other CLI commands to use - config_path = ctx.obj.server_config.save_server_config(host, openarc_port) - console.print(f"[dim]Configuration saved to: {config_path}[/dim]") - - # Handle startup models - models_to_load = [] - if load_models: - models_to_load.append(load_models) - if startup_models: - models_to_load.extend(startup_models) - - if models_to_load: - saved_model_names = ctx.obj.server_config.get_model_names() - missing = [m for m in models_to_load if m not in saved_model_names] - - if missing: - console.print("[yellow]Warning: Models not in config (will be skipped):[/yellow]") - for m in missing: - console.print(f" • {m}") - console.print("[dim]Use 'openarc list' to see saved configurations.[/dim]\n") - - os.environ["OPENARC_STARTUP_MODELS"] = ",".join(models_to_load) - console.print(f"[blue]Models to load on startup:[/blue] {', '.join(models_to_load)}\n") - - console.print(f"[green]Starting OpenArc server on {host}:{openarc_port}[/green]") - from .launch_server import start_server - start_server(host=host, openarc_port=openarc_port) - - -if __name__ == "__main__": - cli() - - - diff --git a/src/cli/utils.py 
b/src/cli/utils.py new file mode 100644 index 0000000..bc5dcc3 --- /dev/null +++ b/src/cli/utils.py @@ -0,0 +1,42 @@ +""" +Utility functions for OpenArc CLI. +""" +from pathlib import Path + + +def validate_model_path(model_path): + """ + Validate that model_path contains OpenVINO model files. + Searches recursively for at least one file whose name contains "_model.bin" and one whose name contains "_model.xml". + Returns True if valid, False otherwise. + """ + path = Path(model_path) + + # Bail out early if the path does not exist + if not path.exists(): + return False + + # Determine search directory - if path is a file, use its parent; if directory, use it + if path.is_file(): + search_dir = path.parent + else: + search_dir = path + + # Check for required files + has_bin = False + has_xml = False + + try: + for file_path in search_dir.rglob("*"): + if file_path.is_file(): + filename = file_path.name + if "_model.bin" in filename: + has_bin = True + if "_model.xml" in filename: + has_xml = True + if has_bin and has_xml: + return True + except (OSError, PermissionError): + return False + + return False diff --git a/src/server/main.py b/src/server/main.py index 0414ae6..6f6558a 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -271,6 +271,7 @@ async def openai_list_models(): @app.post("/v1/chat/completions", dependencies=[Depends(verify_api_key)]) async def openai_chat_completions(request: OpenAIChatCompletionRequest, raw_request: Request): try: + logger.info(f'"{request.model}" request received') config_kwargs = { "messages": request.messages, @@ -461,6 +462,7 @@ async def event_stream() -> AsyncIterator[bytes]: @app.post("/v1/completions", dependencies=[Depends(verify_api_key)]) async def openai_completions(request: OpenAICompletionRequest, raw_request: Request): try: + logger.info(f'"{request.model}" request received') prompt = request.prompt if isinstance(request.prompt, str) else request.prompt[0] config_kwargs = { @@ -599,6 +601,7 @@ async def openai_audio_transcriptions( temperature: Optional[float]
= Form(0.0, description="Sampling temperature") ): try: + logger.info(f'"{model}" request received') # Read the uploaded audio file audio_bytes = await file.read() @@ -641,6 +644,7 @@ async def openai_audio_speech(request: OpenAIKokoroRequest): Returns a WAV file containing the synthesized speech. """ try: + logger.info(f'"{request.model}" request received') gen_config = OV_KokoroGenConfig( kokoro_message=request.input, @@ -674,6 +678,7 @@ async def openai_audio_speech(request: OpenAIKokoroRequest): async def embeddings(request: EmbeddingsRequest): try: + logger.info(f'"{request.model}" request received') tok_config = PreTrainedTokenizerConfig( text=request.input @@ -731,6 +736,7 @@ async def embeddings(request: EmbeddingsRequest): async def rerank(request: RerankRequest): try: + logger.info(f'"{request.model}" request received') config_data = {"query": request.query, "documents": request.documents} if request.prefix is not None: config_data["prefix"] = request.prefix diff --git a/src/server/worker_registry.py b/src/server/worker_registry.py index f46c2fd..5797f12 100644 --- a/src/server/worker_registry.py +++ b/src/server/worker_registry.py @@ -287,23 +287,23 @@ class QueueWorker: @staticmethod async def queue_worker_llm(model_name: str, model_queue: asyncio.Queue, llm_model: OVGenAI_LLM, registry: ModelRegistry): """Text model inference worker that processes packets from queue""" - logger.info(f"[{model_name} LLM Worker] Started, waiting for packets...") + logger.info(f"[LLM Worker: {model_name}] Started, waiting for packets...") while True: packet = await model_queue.get() if packet is None: - logger.info(f"[{model_name} LLM Worker] Shutdown signal received.") + logger.info(f"[LLM Worker: {model_name}] Shutdown signal received.") break completed_packet = await InferWorker.infer_llm(packet, llm_model) # Check if inference failed and trigger model unload if completed_packet.response and completed_packet.response.startswith("Error:"): - logger.error(f"[{model_name} 
LLM Worker] Inference failed, triggering model unload...") + logger.error(f"[LLM Worker: {model_name}] Inference failed, triggering model unload...") asyncio.create_task(registry.register_unload(model_name)) break if completed_packet.metrics: - logger.info(f"[{model_name} LLM Worker] Metrics: {completed_packet.metrics}") + logger.info(f"[LLM Worker: {model_name}] Metrics: {completed_packet.metrics}") if packet.result_future is not None and not packet.result_future.done(): packet.result_future.set_result(completed_packet) @@ -313,23 +313,23 @@ async def queue_worker_llm(model_name: str, model_queue: asyncio.Queue, llm_mode @staticmethod async def queue_worker_vlm(model_name: str, model_queue: asyncio.Queue, vlm_model: OVGenAI_VLM, registry: ModelRegistry): """Image model inference worker that processes packets from queue""" - logger.info(f"[{model_name} VLM Worker] Started, waiting for packets...") + logger.info(f"[VLM Worker: {model_name}] Started, waiting for packets...") while True: packet = await model_queue.get() if packet is None: - logger.info(f"[{model_name} VLM Worker] Shutdown signal received.") + logger.info(f"[VLM Worker: {model_name}] Shutdown signal received.") break completed_packet = await InferWorker.infer_vlm(packet, vlm_model) # Check if inference failed and trigger model unload if completed_packet.response and completed_packet.response.startswith("Error:"): - logger.error(f"[{model_name} VLM Worker] Inference failed, triggering model unload...") + logger.error(f"[VLM Worker: {model_name}] Inference failed, triggering model unload...") asyncio.create_task(registry.register_unload(model_name)) break if completed_packet.metrics: - logger.info(f"[{model_name} VLM Worker] Metrics: {completed_packet.metrics}") + logger.info(f"[VLM Worker: {model_name}] Metrics: {completed_packet.metrics}") if packet.result_future is not None and not packet.result_future.done(): packet.result_future.set_result(completed_packet) @@ -339,23 +339,23 @@ async def 
queue_worker_vlm(model_name: str, model_queue: asyncio.Queue, vlm_mode @staticmethod async def queue_worker_whisper(model_name: str, model_queue: asyncio.Queue, whisper_model: OVGenAI_Whisper, registry: ModelRegistry): """Whisper model inference worker that processes packets from queue""" - logger.info(f"[{model_name} Whisper Worker] Started, waiting for packets...") + logger.info(f"[Whisper Worker: {model_name}] Started, waiting for packets...") while True: packet = await model_queue.get() if packet is None: - logger.info(f"[{model_name} Whisper Worker] Shutdown signal received.") + logger.info(f"[Whisper Worker: {model_name}] Shutdown signal received.") break completed_packet = await InferWorker.infer_whisper(packet, whisper_model) # Check if inference failed and trigger model unload if completed_packet.response and completed_packet.response.startswith("Error:"): - logger.error(f"[{model_name} Whisper Worker] Inference failed, triggering model unload...") + logger.error(f"[Whisper Worker: {model_name}] Inference failed, triggering model unload...") asyncio.create_task(registry.register_unload(model_name)) break if completed_packet.metrics: - logger.info(f"[{model_name} Whisper Worker] Metrics: {completed_packet.metrics}") + logger.info(f"[Whisper Worker: {model_name}] Metrics: {completed_packet.metrics}") if packet.result_future is not None and not packet.result_future.done(): packet.result_future.set_result(completed_packet) @@ -365,25 +365,25 @@ async def queue_worker_whisper(model_name: str, model_queue: asyncio.Queue, whis @staticmethod async def queue_worker_kokoro(model_name: str, model_queue: asyncio.Queue, kokoro_model: OV_Kokoro, registry: ModelRegistry): """Kokoro model inference worker that processes packets from queue""" - logger.info(f"[{model_name} Kokoro Worker] Started, waiting for packets...") + logger.info(f"[Kokoro Worker: {model_name}] Started, waiting for packets...") while True: packet = await model_queue.get() if packet is None: - 
logger.info(f"[{model_name} Kokoro Worker] Shutdown signal received.") + logger.info(f"[Kokoro Worker: {model_name}] Shutdown signal received.") break completed_packet = await InferWorker.infer_kokoro(packet, kokoro_model) # Check if inference failed and trigger model unload if completed_packet.response and completed_packet.response.startswith("Error:"): - logger.error(f"[{model_name} Kokoro Worker] Inference failed, triggering model unload...") + logger.error(f"[Kokoro Worker: {model_name}] Inference failed, triggering model unload...") asyncio.create_task(registry.register_unload(model_name)) break # Log the text that was converted to speech if completed_packet.metrics: - logger.info(f"[{model_name} Kokoro Worker] Metrics: {completed_packet.metrics}") + logger.info(f"[Kokoro Worker: {model_name}] Metrics: {completed_packet.metrics}") if packet.result_future is not None and not packet.result_future.done(): packet.result_future.set_result(completed_packet) @@ -393,21 +393,21 @@ async def queue_worker_kokoro(model_name: str, model_queue: asyncio.Queue, kokor @staticmethod async def queue_worker_emb(model_name: str, model_queue: asyncio.Queue, emb_model: Optimum_EMB, registry: ModelRegistry): """EMB model inference worker that processes packets from queue""" - logger.info(f"[{model_name} EMB Worker] Started, waiting for packets...") + logger.info(f"[EMB Worker: {model_name}] Started, waiting for packets...") while True: packet = await model_queue.get() if packet is None: - logger.info(f"[{model_name} EMB Worker] Shutdown signal received.") + logger.info(f"[EMB Worker: {model_name}] Shutdown signal received.") break completed_packet = await InferWorker.infer_emb(packet, emb_model) # Check if inference failed and trigger model unload if not completed_packet.response: - logger.error(f"[{model_name} EMB Worker] Inference failed, triggering model unload...") + logger.error(f"[EMB Worker: {model_name}] Inference failed, triggering model unload...") 
asyncio.create_task(registry.register_unload(model_name)) break if completed_packet.metrics: - logger.info(f"[{model_name} LLM Worker] Metrics: {completed_packet.metrics}") + logger.info(f"[EMB Worker: {model_name}] Metrics: {completed_packet.metrics}") if packet.result_future is not None and not packet.result_future.done(): packet.result_future.set_result(completed_packet) model_queue.task_done() @@ -415,21 +415,21 @@ async def queue_worker_emb(model_name: str, model_queue: asyncio.Queue, emb_mode @staticmethod async def queue_worker_rr(model_name: str, model_queue: asyncio.Queue, rr_model: Optimum_RR, registry: ModelRegistry): """Reranker model inference worker that processes packets from queue""" - logger.info(f"[{model_name} Reranker Worker] Started, waiting for packets...") + logger.info(f"[Reranker Worker: {model_name}] Started, waiting for packets...") while True: packet = await model_queue.get() if packet is None: - logger.info(f"[{model_name} Reranker Worker] Shutdown signal received.") + logger.info(f"[Reranker Worker: {model_name}] Shutdown signal received.") break completed_packet = await InferWorker.infer_rerank(packet, rr_model) # Check if inference failed and trigger model unload if not completed_packet.response: - logger.error(f"[{model_name} Reranker Worker] Inference failed, triggering model unload...") + logger.error(f"[Reranker Worker: {model_name}] Inference failed, triggering model unload...") asyncio.create_task(registry.register_unload(model_name)) break if completed_packet.metrics: - logger.info(f"[{model_name} Reranker Worker] Metrics: {completed_packet.metrics}") + logger.info(f"[Reranker Worker: {model_name}] Metrics: {completed_packet.metrics}") if packet.result_future is not None and not packet.result_future.done(): packet.result_future.set_result(completed_packet) model_queue.task_done()