Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
353 changes: 204 additions & 149 deletions README.md

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions make_gif.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash

# Exit immediately if a command exits with a non-zero status.
set -e

# Use ImageMagick convert to create the GIF

echo "Generating GIF using ImageMagick convert..."

# -delay: Time between frames in ticks (1/100ths of a second). 67 ticks = 0.67s (~1.5 fps).
# -loop 0: Loop infinitely.
# List all input PNGs in the desired order.
# -resize '800x>': Resize width to 800px max, maintain aspect ratio ONLY if wider. Remove if no resize needed.
# -layers Optimize: Optimize GIF layers (optional, can reduce size).
convert -delay 67 -loop 0 \
demo_output_multistep/step_0_state.png \
demo_output_multistep/step_0_highlight.png \
demo_output_multistep/step_1_state.png \
demo_output_multistep/step_1_highlight.png \
demo_output_multistep/step_2_state.png \
demo_output_multistep/step_2_highlight.png \
demo_output_multistep/final_state.png \
-resize '800x>' \
-layers Optimize \
omnimcp_demo.gif

echo "Generated omnimcp_demo.gif"

# --- How to Adjust GIF Speed ---
# - Change the value after `-delay`. Lower number = faster animation.
# - e.g., `-delay 50` (0.5s / 2 fps), `-delay 33` (~0.33s / 3 fps)
5 changes: 5 additions & 0 deletions omnimcp/config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# omnimcp/config.py

"""Configuration management for OmniMCP."""

import os
Expand All @@ -13,6 +15,9 @@ class OmniMCPConfig(BaseSettings):
# Claude API configuration
ANTHROPIC_API_KEY: Optional[str] = None

# Auto-shutdown OmniParser after 60min inactivity
INACTIVITY_TIMEOUT_MINUTES: int = 60

# OmniParser configuration
OMNIPARSER_URL: Optional[str] = None

Expand Down
2 changes: 2 additions & 0 deletions omnimcp/omnimcp.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# omnimcp/omnimcp.py

"""
OmniMCP: Model Context Protocol for UI Automation through visual understanding.

Expand Down
2 changes: 2 additions & 0 deletions omnimcp/omniparser/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# omnimcp/ominparser/Dockerfile

FROM nvidia/cuda:12.3.1-devel-ubuntu22.04

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
Expand Down
157 changes: 106 additions & 51 deletions omnimcp/omniparser/client.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
# omnimcp/omniparser/client.py

"""Client module for interacting with the OmniParser server."""

import base64
import time
from typing import Optional, Dict, List

import requests
from loguru import logger
from PIL import Image, ImageDraw
import boto3 # Need boto3 for the initial check
import requests

from server import Deploy
from .server import Deploy
from ..config import config


class OmniParserClient:
Expand All @@ -28,59 +31,111 @@ def __init__(self, server_url: Optional[str] = None, auto_deploy: bool = True):

def _ensure_server(self) -> None:
"""Ensure a server is available, deploying one if necessary."""
if not self.server_url:
# Try to find an existing server
deployer = Deploy()
deployer.status() # This will log any running instances

# Check if any instances are running
import boto3

ec2 = boto3.resource("ec2")
instances = ec2.instances.filter(
Filters=[
{"Name": "tag:Name", "Values": ["omniparser"]},
{"Name": "instance-state-name", "Values": ["running"]},
]
)

instance = next(iter(instances), None)
if instance and instance.public_ip_address:
self.server_url = f"http://{instance.public_ip_address}:8000"
logger.info(f"Found existing server at {self.server_url}")
elif self.auto_deploy:
logger.info("No server found, deploying new instance...")
deployer.start()
# Wait for deployment and get URL
max_retries = 30
retry_delay = 10
for i in range(max_retries):
instances = ec2.instances.filter(
Filters=[
{"Name": "tag:Name", "Values": ["omniparser"]},
{"Name": "instance-state-name", "Values": ["running"]},
]
if self.server_url:
logger.info(f"Using provided server URL: {self.server_url}")
else:
logger.info("No server_url provided, attempting discovery/deployment...")
# Try finding existing running instance first
instance_ip = None
instance_id = None
try:
ec2 = boto3.resource("ec2", region_name=config.AWS_REGION)
instances = ec2.instances.filter(
Filters=[
{
"Name": "tag:Name",
"Values": [config.PROJECT_NAME],
}, # Use project name tag
{"Name": "instance-state-name", "Values": ["running"]},
]
)
# Get the most recently launched running instance
running_instances = sorted(
list(instances), key=lambda i: i.launch_time, reverse=True
)
instance = running_instances[0] if running_instances else None

if instance and instance.public_ip_address:
instance_ip = instance.public_ip_address
instance_id = instance.id # Store ID too for logging maybe
self.server_url = f"http://{instance_ip}:{config.PORT}"
logger.success(
f"Found existing running server instance {instance_id} at {self.server_url}"
)
elif self.auto_deploy:
logger.info(
"No running server found, attempting auto-deployment via Deploy.start()..."
)
# Call start and get the result directly
deployer = Deploy()
# Deploy.start now returns IP and ID
instance_ip, instance_id = deployer.start()

if instance_ip and instance_id:
# Deployment succeeded, set the URL
self.server_url = f"http://{instance_ip}:{config.PORT}"
logger.success(
f"Auto-deployment successful. Server URL: {self.server_url} (Instance ID: {instance_id})"
)
else:
# deployer.start() failed and returned None
raise RuntimeError(
"Auto-deployment failed (Deploy.start did not return valid IP/ID). Check server logs."
)
else: # No running instance and auto_deploy is False
raise RuntimeError(
"No server URL provided, no running instance found, and auto_deploy is disabled."
)
instance = next(iter(instances), None)
if instance and instance.public_ip_address:
self.server_url = f"http://{instance.public_ip_address}:8000"
break
time.sleep(retry_delay)
else:
raise RuntimeError("Failed to deploy server")
else:
raise RuntimeError("No server URL provided and auto_deploy is disabled")

# Verify server is responsive
self._check_server()

except Exception as e:
logger.error(
f"Error during server discovery/deployment: {e}", exc_info=True
)
# Re-raise as a RuntimeError to be caught by the main script if needed
raise RuntimeError(f"Server discovery/deployment failed: {e}") from e

# Verify server is responsive (only if server_url is now set)
if self.server_url:
logger.info(f"Checking server responsiveness at {self.server_url}...")
try:
self._check_server() # This probes the URL
logger.success(f"Server at {self.server_url} is responsive.")
except Exception as check_err:
logger.error(f"Server check failed for {self.server_url}: {check_err}")
# Raise error - if we have a URL it should be responsive after deployment/discovery
raise RuntimeError(
f"Server at {self.server_url} failed responsiveness check."
) from check_err
else:
# Safety check - should not be reachable if logic above is correct
raise RuntimeError("Critical error: Failed to obtain server URL.")

def _check_server(self) -> None:
"""Check if the server is responsive."""
if not self.server_url:
raise RuntimeError(
"Cannot check server responsiveness, server_url is not set."
)
try:
response = requests.get(f"{self.server_url}/probe/", timeout=10)
response.raise_for_status()
except Exception as e:
raise RuntimeError(f"Server not responsive: {e}")
# Increased timeout slightly
response = requests.get(f"{self.server_url}/probe/", timeout=15)
response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
# Check content if needed: assert response.json().get("message") == "..."
except requests.exceptions.Timeout:
logger.error(
f"Timeout connecting to server probe endpoint: {self.server_url}/probe/"
)
raise RuntimeError(f"Server probe timed out for {self.server_url}")
except requests.exceptions.ConnectionError:
logger.error(
f"Connection error reaching server probe endpoint: {self.server_url}/probe/"
)
raise RuntimeError(f"Server probe connection error for {self.server_url}")
except requests.exceptions.RequestException as e:
logger.error(
f"Error during server probe request for {self.server_url}: {e}"
)
raise RuntimeError(f"Server probe failed: {e}") from e

def parse_image(self, image: Image.Image) -> Dict:
"""Parse an image using the OmniParser server.
Expand Down
Loading