Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 61 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ droidworld check
Execute from `droidrun-android-world` directory:
```bash
# Example: add contact task
droidworld run --tasks ContactsAddContact
droidworld run --task ContactsAddContact
```

---
Expand Down Expand Up @@ -179,6 +179,12 @@ Run a specific task by name:
droidworld run --task ContactsAddContact
```

Run multiple specific tasks:

```bash
droidworld run --task ContactsAddContact --task ContactsDeleteContact
```

### List Available Tasks

View all available tasks with their IDs:
Expand All @@ -189,17 +195,66 @@ droidworld list-tasks

### Customizing the Benchmark

#### LLM Provider Configuration

```bash
# Run with a different LLM provider and model
droidworld run --llm-provider Anthropic --llm-model claude-3-sonnet-20240229
# Use Anthropic Claude
droidworld run --task ContactsAddContact \
--llm-provider Anthropic \
--llm-model claude-3-sonnet-20240229

# Use OpenAI-compatible API (e.g., third-party proxy)
droidworld run --task ContactsAddContact \
--llm-provider OpenAILike \
--llm-model gemini-2.5-pro \
--api-base http://your-api-endpoint/v1

# Enable vision and reasoning modes
droidworld run --task ContactsAddContact \
--vision \
--reasoning
```

#### Task Family Selection

Choose from different task families:
- `android_world` (default): Full Android World task suite
- `android`: Android-specific tasks
- `miniwob`: MiniWoB tasks
- `information_retrieval`: Information retrieval tasks

```bash
droidworld run --task-family android --min-task-idx 0 --max-task-idx 5
```

#### Performance Tuning

```bash
# Set maximum steps per task: multiplier * task complexity
droidworld run --max-step-multiplier 15
droidworld run --task ContactsAddContact --max-steps-multiplier 15

# Set the per-task timeout multiplier (in seconds)
droidworld run --task ContactsAddContact --timeout-multiplier 300

# Adjust LLM temperature
droidworld run --task ContactsAddContact --temperature 0.7
```

#### Advanced Options

```bash
# Run multiple parameter combinations per task
droidworld run --n-task-combinations 3
droidworld run --task ContactsAddContact --n-task-combinations 3

# Enable debug mode and tracing
droidworld run --task ContactsAddContact --debug --tracing

# Use custom environment URL and device serial
droidworld run --task ContactsAddContact \
--env-url http://localhost:5001 \
--env-serial emulator-5554

# Check all available configuration options with
# Check all available configuration options
droidworld run --help
```

Expand Down
2 changes: 1 addition & 1 deletion droidrun
Submodule droidrun updated 149 files
60 changes: 41 additions & 19 deletions eval/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from eval.env.boot import boot_environment
from eval.runner import run_task_on_env
from eval.tracker import write_task_result
from eval.portal.keepalive import disable_overlay_once
from droidrun.portal import toggle_overlay
from droidrun import load_llm, __version__ as droidrun_version
from android_world import __version__ as android_world_version
from adbutils import adb
Expand Down Expand Up @@ -67,7 +67,7 @@ def version():
@cli.command()
@click.option(
"--env-url",
default="http://localhost:5000",
default="http://localhost:5001",
help="Android World Environment URL to use.",
)
def list_tasks(env_url):
Expand All @@ -81,14 +81,15 @@ def list_tasks(env_url):
@cli.command()
@click.option(
"--env-url",
default="http://localhost:5000",
default="http://localhost:5001",
help="Android World Environment URL to use.",
)
@click.option("--env-serial", default="emulator-5554", help="Device serial to use.")
def check(env_url, env_serial):
import asyncio
env = AndroidEnvClient(env_url)
try:
boot_environment(env, env_serial)
asyncio.run(boot_environment(env, env_serial))
logger.info("Environment is healthy")
except Exception as e:
logger.error(f"Error booting environment: {e}")
Expand All @@ -100,7 +101,7 @@ def check(env_url, env_serial):
def disable_overlay(env_serial):
try:
device = adb.device(env_serial)
disable_overlay_once(device)
toggle_overlay(device, False)
logger.info("Overlay disabled")
except Exception as e:
logger.error(f"Error disabling overlay: {e}")
Expand All @@ -110,7 +111,7 @@ def disable_overlay(env_serial):
@cli.command()
@click.option(
"--env-url",
default="http://localhost:5000",
default="http://localhost:5001",
help="Android World Environment URL to use.",
)
@click.option("--env-serial", default="emulator-5554", help="Device serial to use.")
Expand All @@ -122,14 +123,16 @@ def disable_overlay(env_serial):
@click.option(
"--n-task-combinations", "-n", default=1, help="Number of task combinations."
)
@click.option("--llm-provider", default="Gemini", help="LLM provider to use.")
@click.option("--llm-model", default="gemini-2.5-pro", help="LLM model to use.")
@click.option("--vision", is_flag=True, help="Enable vision.")
@click.option("--reasoning", is_flag=True, help="Enable reasoning.")
@click.option("--reflection", is_flag=True, help="Enable reflection.")
@click.option("--debug", is_flag=True, help="Enable debug mode.")
@click.option("--temperature", default=0.5, help="Temperature to use.")
@click.option("--tracing", is_flag=True, help="Enable tracing.")
@click.option("--config", "-c", default=None, help="Path to DroidRun config.yaml file. If provided, CLI LLM parameters will be ignored.")
@click.option("--llm-provider", default="Gemini", help="LLM provider to use (ignored if --config is provided).")
@click.option("--llm-model", default="gemini-2.5-pro", help="LLM model to use (ignored if --config is provided).")
@click.option("--api-base", default=None, help="Base URL for API (ignored if --config is provided).")
@click.option("--vision", is_flag=True, help="Enable vision (ignored if --config is provided).")
@click.option("--reasoning", is_flag=True, help="Enable reasoning (ignored if --config is provided).")
@click.option("--reflection", is_flag=True, help="Enable reflection (ignored if --config is provided).")
@click.option("--debug", is_flag=True, help="Enable debug mode (ignored if --config is provided).")
@click.option("--temperature", default=0.5, help="Temperature to use (ignored if --config is provided).")
@click.option("--tracing", is_flag=True, help="Enable tracing (ignored if --config is provided).")
@click.option("--max-steps-multiplier", default=15, help="Max steps multiplier.")
@click.option("--timeout-multiplier", default=300, help="Timeout multiplier.")
@make_sync
Expand All @@ -142,8 +145,10 @@ async def run(
max_task_idx,
task,
n_task_combinations,
config,
llm_provider,
llm_model,
api_base,
vision,
reasoning,
reflection,
Expand All @@ -156,7 +161,7 @@ async def run(
env = AndroidEnvClient(env_url)

try:
boot_environment(env, env_serial)
await boot_environment(env, env_serial)
except Exception as e:
logger.error(f"Error booting environment: {e}")
logger.info(
Expand Down Expand Up @@ -184,17 +189,33 @@ async def run(

logger.info(f"Found tasks: {', '.join(task_list)} ({len(task_list)})")

logger.debug(f"Loading LLM: {llm_provider} {llm_model} {temperature}")
llm = load_llm(llm_provider, model=llm_model, temperature=temperature)
logger.debug("LLM loaded successfully")
# Load config or use CLI parameters
droidrun_config = None
llm = None

if config:
# Use config file - load DroidrunConfig
from droidrun.config_manager import DroidrunConfig
logger.info(f"Loading DroidRun config from: {config}")
droidrun_config = DroidrunConfig.from_yaml(config)
logger.info("Config loaded successfully. CLI LLM parameters will be ignored.")
else:
# Use CLI parameters - load single LLM
logger.debug(f"Loading LLM from CLI: {llm_provider} {llm_model} {temperature}")
llm_kwargs = {"model": llm_model, "temperature": temperature}
if api_base:
llm_kwargs["api_base"] = api_base
logger.debug(f"Using custom API base: {api_base}")
llm = load_llm(llm_provider, **llm_kwargs)
logger.debug("LLM loaded successfully from CLI parameters")

for task_name in task_list:
task_id = all_tasks.index(task_name)
num_tasks = env.get_suite_task_length(task_name)

for task_idx in range(num_tasks):
try:
boot_environment(env, env_serial)
await boot_environment(env, env_serial)
except Exception as e:
logger.error(f"Error booting environment: {e}")
logger.info(
Expand All @@ -217,6 +238,7 @@ async def run(
reflection,
tracing,
debug,
droidrun_config=droidrun_config,
)
if e:
logger.error(f"Error running task {task_name} {task_idx}: {e}")
Expand Down
51 changes: 29 additions & 22 deletions eval/env/boot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
setup_keyboard,
A11Y_SERVICE_NAME as DROIDRUN_A11Y_SERVICE_NAME,
)
from adbutils import adb, AdbDevice
from async_adbutils import adb, AdbDevice
import time

logger = logging.getLogger(__name__)
Expand All @@ -22,30 +22,37 @@
)
DEFAULT_OVERLAY_OFFSET = -126

async def ensure_connected(serial: str) -> AdbDevice:
    """Return a responsive adb device handle for ``serial``.

    Serials of the form ``emulator-<port>`` are skipped for ``adb connect``;
    any serial containing ``:`` or not starting with ``emulator-`` is
    connected first. The device is then probed with a trivial shell command.

    Args:
        serial: adb serial, e.g. ``emulator-5554`` or ``host:port``.

    Returns:
        The device handle obtained from ``adb.device(serial)``.

    Raises:
        RuntimeError: if connecting to or probing the device fails. The
            underlying exception is chained as ``__cause__``.
    """
    import inspect

    try:
        # Emulator serials are already attached to the local adb server;
        # only other targets (e.g. network addresses) need an explicit connect.
        needs_connect = ":" in serial or not serial.startswith("emulator-")
        if needs_connect:
            res = adb.connect(serial)
            # NOTE(review): `adb` comes from async_adbutils, where connect()
            # presumably returns a coroutine like device() does - confirm.
            # Awaiting only when needed keeps this correct either way.
            if inspect.isawaitable(res):
                res = await res
            if "failed" in res or "unable" in res:
                raise RuntimeError(f"Failed to connect: {res}")

        device = await adb.device(serial)
        # A trivial shell round-trip proves the device actually responds.
        await device.shell("echo test")
        return device
    except Exception as e:
        raise RuntimeError(f"Device {serial} is not connected: {e}") from e


def install_portal(device: AdbDevice):
async def install_portal(device: AdbDevice):
logger.info("Installing portal...")

try:
with download_portal_apk() as apk_path:
device.install(apk_path, uninstall=True, flags=["-g"], silent=False)
await device.install(apk_path, uninstall=True, flags=["-g"], silent=False)
logger.info("Portal APK installed successfully")
except Exception as e:
raise RuntimeError(f"Failed to download and install portal APK: {e}")

try:
logger.info("Enabling portal as accessibility service...")
enable_portal_accessibility(
await enable_portal_accessibility(
device, service_name=DROIDRUN_X_GOOGLE_A11Y_SERVICE_NAME
)
logger.info("Portal accessibility enabled successfully")
Expand All @@ -54,36 +61,36 @@ def install_portal(device: AdbDevice):

try:
logger.info("Setting up keyboard for environment...")
setup_keyboard(device)
await setup_keyboard(device)
logger.info("Keyboard setup completed successfully!")
except Exception as e:
raise RuntimeError(f"Failed to setup keyboard: {e}")


def check_portal(device: AdbDevice):
if not check_portal_accessibility(
async def check_portal(device: AdbDevice):
if not await check_portal_accessibility(
device, service_name=DROIDRUN_X_GOOGLE_A11Y_SERVICE_NAME
):
raise RuntimeError("Accessibility settings invalid")

try:
set_overlay_offset(device, DEFAULT_OVERLAY_OFFSET)
await set_overlay_offset(device, DEFAULT_OVERLAY_OFFSET)
logger.info("Overlay offset set successfully")
except Exception as e:
raise RuntimeError(f"Failed to set overlay offset: {e}")

try:
ping_portal(device)
await ping_portal(device)
except Exception as e:
raise RuntimeError(f"Failed to ping portal: {e}")

try:
ping_portal_content(device)
await ping_portal_content(device)
except Exception as e:
raise RuntimeError(f"Failed to ping portal content: {e}")

try:
ping_portal_tcp(device)
await ping_portal_tcp(device)
except Exception as e:
raise RuntimeError(f"Failed to ping portal TCP: {e}")

Expand Down Expand Up @@ -117,7 +124,7 @@ def wait_ready(env: AndroidEnvClient, timeout: int = 300):
)


def boot_environment(env: AndroidEnvClient, serial: str):
async def boot_environment(env: AndroidEnvClient, serial: str):
try:
logger.info(f"Waiting for environment {env.base_url} to be ready...")
wait_ready(env, timeout=600)
Expand All @@ -127,15 +134,15 @@ def boot_environment(env: AndroidEnvClient, serial: str):
raise e

try:
device = ensure_connected(serial)
device = await ensure_connected(serial)
except Exception as e:
logger.error(f"Environment {env.base_url} failed to connect via adb: {e}")
raise e

# check if portal is already installed
try:
logger.info(f"Checking portal for environment {env.base_url}...")
check_portal(device)
await check_portal(device)
logger.info("Portal is installed and accessible. You're good to go!")
return
except Exception as e:
Expand All @@ -145,15 +152,15 @@ def boot_environment(env: AndroidEnvClient, serial: str):

try:
logger.info(f"Installing portal for environment {env.base_url}...")
install_portal(device)
await install_portal(device)
logger.info(f"Portal installed successfully for environment {env.base_url}!")
except Exception as e:
logger.error(f"Environment {env.base_url} failed to install portal: {e}")
raise e

try:
logger.info(f"Checking portal for environment {env.base_url}...")
check_portal(device)
await check_portal(device)
logger.info("Portal is installed and accessible. You're good to go!")
except Exception as e:
logger.error(f"Environment {env.base_url} failed to check portal: {e}")
Expand Down
2 changes: 1 addition & 1 deletion eval/env/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def parse_element(data: dict[str, Any]) -> representation_utils.UIElement:
class AndroidEnvClient:
"""Client for interacting with the Android environment server."""

def __init__(self, base_url: str = "http://localhost:5000"):
def __init__(self, base_url: str = "http://localhost:5001"):
logger.info(
"Setting up Android environment using Docker - Initial setup may take"
" 5-10 minutes. Please wait..."
Expand Down
Loading