diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 60d458f..f7bd27d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,3 +48,7 @@ jobs: env: ANTHROPIC_API_KEY: "ci_dummy_key" run: uv run pytest tests/ + + # --- 8. Smoke test cli.py + - name: Run CLI Smoke Test (--help) + run: uv run python cli.py --help diff --git a/.gitignore b/.gitignore index f0922a2..2717a74 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ omnimcp.egg-info/ omnimcp.log __pycache__ +runs/ +logs/ +images/*/ diff --git a/cli.py b/cli.py index 65d9b5c..fcac9ac 100644 --- a/cli.py +++ b/cli.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python # cli.py """ @@ -10,20 +11,7 @@ import fire -# Import necessary components from the project -from omnimcp.agent_executor import AgentExecutor -from omnimcp.config import config -from omnimcp.core import plan_action_for_ui -from omnimcp.input import InputController, _pynput_error # Check pynput import status -from omnimcp.omniparser.client import OmniParserClient -from omnimcp.omnimcp import VisualState -from omnimcp.utils import ( - logger, - draw_bounding_boxes, - draw_action_highlight, - NSScreen, # Check for AppKit on macOS -) - +from omnimcp.utils import logger # Default configuration DEFAULT_OUTPUT_DIR = "runs" @@ -35,6 +23,7 @@ def run( goal: str = DEFAULT_GOAL, max_steps: int = DEFAULT_MAX_STEPS, output_dir: str = DEFAULT_OUTPUT_DIR, + ci_mode: bool = False, ): """ Runs the OmniMCP agent to achieve a specified goal. @@ -43,9 +32,34 @@ def run( goal: The natural language goal for the agent. max_steps: Maximum number of steps to attempt. output_dir: Base directory to save run artifacts (timestamped subdirs). + ci_mode: Run in CI mode (skips API validation and actual execution).
""" # --- Initial Checks --- logger.info("--- OmniMCP CLI ---") + + # Skip import-time checks if we're in CI mode + if ci_mode: + logger.info("Running in CI mode - skipping credential checks and execution") + return 0 + + # Delay imports to avoid credential checks at import time + try: + # Import necessary components from the project + from omnimcp.config import config + from omnimcp.input import InputController, _pynput_error + from omnimcp.agent_executor import AgentExecutor + from omnimcp.core import plan_action_for_ui + from omnimcp.omniparser.client import OmniParserClient + from omnimcp.visual_state import VisualState + from omnimcp.utils import ( + draw_bounding_boxes, + draw_action_highlight, + NSScreen, # Check for AppKit on macOS + ) + except ImportError as e: + logger.critical(f"Required dependency not found: {e}") + return 1 + logger.info("Performing initial checks...") success = True @@ -84,7 +98,7 @@ def run( if not success: logger.error("Prerequisite checks failed. Exiting.") - sys.exit(1) + return 1 # --- Component Initialization --- logger.info("\nInitializing components...") @@ -116,10 +130,10 @@ def run( logger.critical( " Ensure all requirements are installed (`uv pip install -e .`)" ) - sys.exit(1) + return 1 except Exception as e: logger.critical(f"❌ Component initialization failed: {e}", exc_info=True) - sys.exit(1) + return 1 # --- Agent Executor Initialization --- logger.info("\nInitializing Agent Executor...") @@ -134,7 +148,7 @@ def run( logger.success("✅ Agent Executor initialized successfully.") except Exception as e: logger.critical(f"❌ Agent Executor initialization failed: {e}", exc_info=True) - sys.exit(1) + return 1 # --- User Confirmation & Start --- print("\n" + "=" * 60) @@ -159,13 +173,13 @@ def run( ) except KeyboardInterrupt: logger.warning("\nExecution interrupted by user (Ctrl+C).") - sys.exit(1) + return 1 except Exception as run_e: logger.critical( f"\nAn unexpected error occurred during the agent run: {run_e}", 
exc_info=True, ) - sys.exit(1) + return 1 finally: # Optional: Add cleanup here if needed (e.g., stopping parser server) logger.info( @@ -176,13 +190,20 @@ def run( # --- Exit --- if overall_success: logger.success("\nAgent run finished successfully (goal achieved).") - sys.exit(0) + return 0 else: logger.error( "\nAgent run finished unsuccessfully (goal not achieved or error occurred)." ) - sys.exit(1) + return 1 + + +def main(): + """Main entry point that handles Fire's return code conversion.""" + result = fire.Fire(run) + if isinstance(result, int): + sys.exit(result) if __name__ == "__main__": - fire.Fire(run) + main() diff --git a/omnimcp/__init__.py b/omnimcp/__init__.py index 3cbe851..d5b28ba 100644 --- a/omnimcp/__init__.py +++ b/omnimcp/__init__.py @@ -1,24 +1,45 @@ -# omnimcp/__init__.py - import sys import os - from loguru import logger from omnimcp.config import config -log_dir = "logs" -os.makedirs(log_dir, exist_ok=True) -# Define file path using a format string recognized by loguru's sink -log_file_path = os.path.join(log_dir, "run_{time:YYYY-MM-DD_HH-mm-ss}.log") +# Remove default handler +logger.remove() -logger.remove() # Remove default handler to configure levels precisely -# Log INFO and above to stderr +# Add stderr handler (keep this functionality) logger.add(sys.stderr, level=config.LOG_LEVEL.upper() if config.LOG_LEVEL else "INFO") -# Log DEBUG and above to a rotating file -logger.add( - log_file_path, rotation="50 MB", level="DEBUG", encoding="utf8", enqueue=True -) # enqueue for async safety -logger.info("Logger configured.") -# You might want to set LOG_LEVEL=DEBUG in your .env file now + +# Define a function to configure run-specific logging +def setup_run_logging(run_dir=None): + """ + Configure additional logging for a specific run. + + Args: + run_dir: Directory to store run-specific logs. If None, logs go to default logs directory. 
+ + Returns: + The log file path + """ + # Determine log file location + if run_dir: + os.makedirs(run_dir, exist_ok=True) + log_file_path = os.path.join(run_dir, "run.log") + else: + log_dir = config.LOG_DIR or "logs" + os.makedirs(log_dir, exist_ok=True) + log_file_path = os.path.join(log_dir, "run_{time:YYYY-MM-DD_HH-mm-ss}.log") + + # Add run-specific log handler + logger.add( + log_file_path, rotation="50 MB", level="DEBUG", encoding="utf8", enqueue=True + ) + + logger.info(f"Run logging configured. Log path: {log_file_path}") + return log_file_path + + +# Set up default logging (for non-run use) +if not config.DISABLE_DEFAULT_LOGGING: + setup_run_logging() diff --git a/omnimcp/agent_executor.py b/omnimcp/agent_executor.py index 79d1a8b..c8716b5 100644 --- a/omnimcp/agent_executor.py +++ b/omnimcp/agent_executor.py @@ -8,9 +8,9 @@ from PIL import Image -# Used for type hinting if Protocol is simple: -from .types import LLMActionPlan, UIElement -from .utils import ( +from omnimcp import config, setup_run_logging +from omnimcp.types import LLMActionPlan, UIElement +from omnimcp.utils import ( denormalize_coordinates, draw_action_highlight, draw_bounding_boxes, @@ -194,10 +194,16 @@ def _execute_scroll( # Comparison Note: # This `run` method implements an explicit, sequential perceive-plan-act loop. - # Alternative agent architectures exist... (rest of comment remains same) + # Alternative agent architectures exist, such as: + # - ReAct (Reasoning-Acting): Where the LLM explicitly decides between + # reasoning steps and action steps. + # - Callback-driven: Where UI events or timers might trigger agent actions. + # - More complex state machines or graph-based execution flows. + # This simple sequential loop provides a clear baseline. Future work might explore + # these alternatives for more complex or reactive tasks. 
def run( - self, goal: str, max_steps: int = 10, output_base_dir: str = "runs" + self, goal: str, max_steps: int = 10, output_base_dir: Optional[str] = None ) -> bool: """ Runs the main perceive-plan-act loop to achieve the goal. @@ -206,16 +212,28 @@ def run( goal: The natural language goal for the agent. max_steps: Maximum number of steps to attempt. output_base_dir: Base directory to save run artifacts (timestamped). + If None, uses config.RUN_OUTPUT_DIR. Returns: True if the goal was achieved, False otherwise (error or max steps reached). """ + + # Use configured output dir if none provided + if output_base_dir is None: + output_base_dir = config.RUN_OUTPUT_DIR + run_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") run_output_dir = os.path.join(output_base_dir, run_timestamp) + try: os.makedirs(run_output_dir, exist_ok=True) + + # Configure run-specific logging + log_path = setup_run_logging(run_output_dir) + logger.info(f"Starting agent run. Goal: '{goal}'") logger.info(f"Saving outputs to: {run_output_dir}") + logger.info(f"Run log file: {log_path}") except OSError as e: logger.error(f"Failed to create output directory {run_output_dir}: {e}") return False diff --git a/omnimcp/completions.py b/omnimcp/completions.py index d57f263..c2bcada 100644 --- a/omnimcp/completions.py +++ b/omnimcp/completions.py @@ -32,7 +32,7 @@ ) MAX_RETRIES = 3 -DEFAULT_MODEL = "claude-3-7-sonnet-20250219" +DEFAULT_MODEL = config.ANTHROPIC_DEFAULT_MODEL or "claude-3-7-sonnet-20250219" @retry( diff --git a/omnimcp/config.py b/omnimcp/config.py index 29e5eed..0f74f06 100644 --- a/omnimcp/config.py +++ b/omnimcp/config.py @@ -14,6 +14,8 @@ class OmniMCPConfig(BaseSettings): # Claude API configuration ANTHROPIC_API_KEY: Optional[str] = None + ANTHROPIC_DEFAULT_MODEL: str = "claude-3-7-sonnet-20250219" + # ANTHROPIC_DEFAULT_MODEL: str = "claude-3-haiku-20240307" # Auto-shutdown OmniParser after 60min inactivity INACTIVITY_TIMEOUT_MINUTES: int = 60 @@ -29,13 +31,25 @@ class 
OmniMCPConfig(BaseSettings): # OmniParser deployment configuration PROJECT_NAME: str = "omniparser" REPO_URL: str = "https://github.com/microsoft/OmniParser.git" - AWS_EC2_AMI: str = "ami-06835d15c4de57810" + # AWS_EC2_AMI: str = "ami-06835d15c4de57810" + AWS_EC2_AMI: str = ( + "ami-04631c7d8811d9bae" # Official AWS DLAMI Base Ubuntu 22.04 (G6 Compatible) + ) AWS_EC2_DISK_SIZE: int = 128 # GB - AWS_EC2_INSTANCE_TYPE: str = "g4dn.xlarge" # (T4 16GB $0.526/hr x86_64) + # AWS_EC2_INSTANCE_TYPE: str = "g4dn.xlarge" # (T4 16GB $0.526/hr x86_64) + AWS_EC2_INSTANCE_TYPE: str = "g6.xlarge" # (L4 24GB $0.805/hr x86_64) + # AWS_EC2_INSTANCE_TYPE: str = "p3.2xlarge" # (V100 16GB $3.06/hr x86_64) AWS_EC2_USER: str = "ubuntu" PORT: int = 8000 # FastAPI port COMMAND_TIMEOUT: int = 600 # 10 minutes + # Logging configuration + LOG_DIR: Optional[str] = "logs" + DISABLE_DEFAULT_LOGGING: bool = False + + # Run output configuration + RUN_OUTPUT_DIR: str = "runs" + # Debug settings # DEBUG: bool = False LOG_LEVEL: str = "INFO"