24 changes: 23 additions & 1 deletion README.md
@@ -100,7 +100,7 @@ uv run benchmarks/swe_bench/build_images.py \
```


-### 3. Run SWE-Bench Evaluation
+### 3. Run SWE-Bench Inference
```bash
# Run evaluation with your configured LLM
uv run swebench-infer .llm_config/example.json \
@@ -134,6 +134,28 @@ python -m benchmarks.swe_bench.run_infer \

This will only evaluate the instances listed in the file.
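
The exact file format is defined by `run_infer.py`; assuming it is a plain-text list of SWE-Bench instance IDs, one per line, it would look something like this (IDs shown only for illustration):

```text
django__django-11333
astropy__astropy-12907
```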

### 5. Evaluate SWE-Bench Results
After running inference, evaluate the results with the official SWE-Bench evaluation harness:

```bash
# Convert output format and run SWE-Bench evaluation
uv run swebench-eval output.jsonl

# Or specify custom dataset and output file
uv run swebench-eval output.jsonl --dataset princeton-nlp/SWE-bench_Lite --output-file results.swebench.jsonl

# Only convert format without running evaluation
uv run swebench-eval output.jsonl --skip-evaluation

# Install SWE-Bench if needed
uv run swebench-eval output.jsonl --install-swebench
```

The script will:
1. Convert OpenHands output format to SWE-Bench prediction format (see the sketch below)
2. Install SWE-Bench if not already available (optional)
3. Run the official SWE-Bench evaluation harness
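
As a rough sketch of what step 1 produces (based on the formats documented in `eval_infer.py`; the patch post-processing the script also applies is omitted here), each OpenHands record maps to one SWE-Bench prediction record:

```python
# Sketch of the per-record conversion performed by swebench-eval
# (field names follow the docstring in benchmarks/swe_bench/eval_infer.py).
import json

openhands_record = {
    "instance_id": "django__django-11333",
    "test_result": {"git_patch": "diff --git a/file.py b/file.py\n..."},
}

prediction = {
    "instance_id": openhands_record["instance_id"],
    "model_patch": openhands_record["test_result"].get("git_patch", ""),
    "model_name_or_path": "OpenHands",  # override with --model-name
}

print(json.dumps(prediction))  # one line of the resulting .swebench.jsonl file
```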

## Links

- **Original OpenHands**: https://github.com/All-Hands-AI/OpenHands/
293 changes: 293 additions & 0 deletions benchmarks/swe_bench/eval_infer.py
@@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""
SWE-Bench Evaluation Script

This script converts OpenHands output.jsonl format to SWE-Bench prediction format
and runs the SWE-Bench evaluation.

Usage:
uv run swebench-eval <path_to_output.jsonl>
"""

import argparse
import json
import subprocess
import sys
from pathlib import Path

from benchmarks.utils.patch_utils import remove_files_from_patch
from openhands.sdk import get_logger


logger = get_logger(__name__)


def convert_to_swebench_format(
input_file: str, output_file: str, model_name: str = "OpenHands"
) -> None:
"""
Convert OpenHands output.jsonl to SWE-Bench prediction format.

OpenHands format:
{
"instance_id": "django__django-11333",
"test_result": {
"git_patch": "diff --git a/file.py b/file.py\n..."
},
"instruction": "...",
"error": null,
"history": [...]
}

SWE-Bench format:
{
"instance_id": "django__django-11333",
"model_patch": "diff --git a/file.py b/file.py\n...",
"model_name_or_path": "OpenHands"
}
"""
logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}")

converted_count = 0
error_count = 0

with open(input_file, "r") as infile, open(output_file, "w") as outfile:
for line_num, line in enumerate(infile, 1):
try:
line = line.strip()
if not line:
continue

data = json.loads(line)

# Extract required fields
instance_id = data.get("instance_id")
if not instance_id:
logger.warning(f"Line {line_num}: Missing instance_id")
error_count += 1
continue

# Extract git_patch from test_result
test_result = data.get("test_result", {})
git_patch = test_result.get("git_patch", "")

if not git_patch:
logger.warning(
f"Line {line_num}: Missing or empty git_patch for {instance_id}"
)
# Still create entry with empty patch
git_patch = ""

                # Post-process the git patch: drop changes to packaging/config files
setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
git_patch = remove_files_from_patch(git_patch, setup_files)

# Create SWE-Bench format entry
swebench_entry = {
"instance_id": instance_id,
"model_patch": git_patch,
"model_name_or_path": model_name,
}

# Write to output file
outfile.write(json.dumps(swebench_entry) + "\n")
converted_count += 1

except json.JSONDecodeError as e:
logger.error(f"Line {line_num}: Invalid JSON - {e}")
error_count += 1
except Exception as e:
logger.error(f"Line {line_num}: Unexpected error - {e}")
error_count += 1

logger.info(
f"Conversion complete: {converted_count} entries converted, "
f"{error_count} errors"
)

if converted_count == 0:
raise ValueError("No valid entries were converted")


def install_swebench() -> None:
"""Install SWE-Bench if not already available."""
import importlib.util

if importlib.util.find_spec("swebench") is not None:
logger.info("SWE-Bench is already installed")
return

logger.info("Installing SWE-Bench...")
try:
# Install SWE-Bench using uv
subprocess.run(
["uv", "add", "swebench"], check=True, capture_output=True, text=True
)
logger.info("SWE-Bench installed successfully")
except subprocess.CalledProcessError as e:
logger.error(f"Failed to install SWE-Bench: {e}")
logger.error(f"stdout: {e.stdout}")
logger.error(f"stderr: {e.stderr}")
raise
Contributor (xingyaoww):

You should just `uv add swebench` and maintain its version in pyproject.toml and uv.lock -- so you can get rid of this function and all associated configs.

Contributor (xingyaoww):

@OpenHands please do it for me

OpenHands:

I'm on it! xingyaoww can track my progress at all-hands.dev

OpenHands:

Summary

I've addressed the PR comment from @xingyaoww on PR #57. The requested changes have been completed and pushed to the jmj/swebench-eval-infer branch.

Changes made:

- Added swebench as a proper dependency in pyproject.toml using `uv add swebench`
- Removed the install_swebench() function (22 lines), which previously attempted to install swebench dynamically at runtime
- Removed the --install-swebench CLI argument and its help text
- Removed the call to install_swebench() in the main function
- Updated README.md to remove the --install-swebench example and simplify the workflow description

Result:

The SWE-Bench dependency is now managed through pyproject.toml and uv.lock like other dependencies, eliminating the need for runtime installation logic. This is cleaner, more maintainable, and follows standard Python dependency management practices.

Commit: ec01450 - "refactor: manage swebench dependency via uv add"
Branch: jmj/swebench-eval-infer

The changes are now live on the PR and ready for review.



def run_swebench_evaluation(
predictions_file: str,
dataset: str = "princeton-nlp/SWE-bench_Verified",
workers: str = "12",
) -> None:
"""
Run SWE-Bench evaluation on the predictions file.

Args:
predictions_file: Path to the SWE-Bench format predictions file
dataset: SWE-Bench dataset to evaluate against
workers: Number of workers to use for evaluation
"""
logger.info(f"Running SWE-Bench evaluation on {predictions_file}")

try:
# Get the directory of the predictions file
predictions_path = Path(predictions_file)
predictions_dir = predictions_path.parent
predictions_filename = predictions_path.name

# Run SWE-Bench evaluation using global python (not UV environment)
# since swebench is installed globally
cmd = [
"/usr/bin/python3",
"-m",
"swebench.harness.run_evaluation",
"--dataset_name",
dataset,
"--predictions_path",
predictions_filename,
"--max_workers",
str(workers),
"--run_id",
f"eval_{predictions_path.stem}",
]

logger.info(f"Running command: {' '.join(cmd)}")
logger.info(f"Working directory: {predictions_dir}")
logger.info("SWE-Bench evaluation output:")
print("-" * 80)

# Stream output directly to console, running from predictions file directory
result = subprocess.run(cmd, text=True, cwd=predictions_dir)

print("-" * 80)
if result.returncode == 0:
logger.info("SWE-Bench evaluation completed successfully")
else:
logger.error(
f"SWE-Bench evaluation failed with return code {result.returncode}"
)
raise subprocess.CalledProcessError(result.returncode, cmd)

except FileNotFoundError:
logger.error(
"SWE-Bench evaluation command not found. "
"Make sure SWE-Bench is properly installed."
)
raise
except Exception as e:
logger.error(f"Error running SWE-Bench evaluation: {e}")
raise


def main() -> None:
"""Main entry point for the script."""
parser = argparse.ArgumentParser(
description="Convert OpenHands output to SWE-Bench format and run evaluation",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
uv run swebench-eval output.jsonl
uv run swebench-eval /path/to/output.jsonl --dataset princeton-nlp/SWE-bench_Lite
uv run swebench-eval output.jsonl --model-name "MyModel-v1.0"
""",
)

parser.add_argument("input_file", help="Path to the OpenHands output.jsonl file")

parser.add_argument(
"--dataset",
default="princeton-nlp/SWE-bench_Verified",
help="SWE-Bench dataset to evaluate against "
"(default: princeton-nlp/SWE-bench_Verified)",
)

parser.add_argument(
"--output-file",
help="Output file for SWE-Bench format "
"(default: input_file with .swebench.jsonl extension)",
)

parser.add_argument(
"--skip-evaluation",
action="store_true",
help="Only convert format, skip running evaluation",
)

parser.add_argument(
"--install-swebench",
action="store_true",
help="Install SWE-Bench before running evaluation",
)

parser.add_argument(
"--model-name",
default="OpenHands",
help="Model name to use in the model_name_or_path field (default: OpenHands)",
)

parser.add_argument(
"--workers",
default="12",
help="Number of workers to use when evaluating",
)

args = parser.parse_args()

# Validate input file
input_file = Path(args.input_file)
if not input_file.exists():
logger.error(f"Input file does not exist: {input_file}")
sys.exit(1)

    if input_file.suffix != ".jsonl":
logger.warning(f"Input file does not have .jsonl extension: {input_file}")

# Determine output file
if args.output_file:
output_file = Path(args.output_file)
else:
output_file = input_file.with_suffix(".swebench.jsonl")

logger.info(f"Input file: {input_file}")
logger.info(f"Output file: {output_file}")
logger.info(f"Dataset: {args.dataset}")
logger.info(f"Model name: {args.model_name}")

try:
# Convert format
convert_to_swebench_format(str(input_file), str(output_file), args.model_name)

if not args.skip_evaluation:
# Install SWE-Bench if requested
if args.install_swebench:
install_swebench()

# Run evaluation
run_swebench_evaluation(str(output_file), args.dataset, args.workers)

logger.info("Script completed successfully!")

except Exception as e:
logger.error(f"Script failed: {e}")
sys.exit(1)


if __name__ == "__main__":
main()
53 changes: 0 additions & 53 deletions benchmarks/utils/binary_patch_utils.py

This file was deleted.
