
Commit 7630560

Merge pull request #1126 from gpt-engineer-org/bench_config
Bench config
2 parents 31da734 + 3542d17 commit 7630560

File tree

12 files changed: +246 −51 lines changed


gpt_engineer/benchmark/__init__.py

Whitespace-only changes.

gpt_engineer/benchmark/__main__.py

Lines changed: 23 additions & 10 deletions

@@ -20,6 +20,7 @@
 The standard boilerplate for invoking the main function when the script is executed.
 """
 import importlib
+import os.path
 
 from typing import Annotated, Optional
 
@@ -29,6 +30,7 @@
 from langchain.globals import set_llm_cache
 
 from gpt_engineer.applications.cli.main import load_env_if_needed
+from gpt_engineer.benchmark.bench_config import BenchConfig
 from gpt_engineer.benchmark.benchmarks.load import get_benchmark
 from gpt_engineer.benchmark.run import print_results, run
 
@@ -69,12 +71,9 @@ def main(
             help="python file that contains a function called 'default_config_agent'"
         ),
     ],
-    benchmarks: Annotated[
-        str, typer.Argument(help="benchmark name(s) separated by ','")
-    ],
-    task_name: Annotated[
+    bench_config: Annotated[
         Optional[str], typer.Argument(help="optional task name in benchmark")
-    ] = None,
+    ] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"),
     verbose: Annotated[
         bool, typer.Option(help="print results for each task", show_default=False)
     ] = False,
@@ -88,8 +87,8 @@ def main(
        The file path to the Python module that contains a function called 'default_config_agent'.
    benchmarks : str
        A comma-separated string of benchmark names to run.
-    task_name : Optional[str], default=None
-        An optional task name to run within the benchmark.
+    bench_config : Optional[str], default=default_bench_config.toml
+        Configuration file for choosing which benchmark problems to run. See default config for more details.
    verbose : bool, default=False
        A flag to indicate whether to print results for each task.
 
@@ -99,13 +98,27 @@ def main(
     """
     set_llm_cache(SQLiteCache(database_path=".langchain.db"))
     load_env_if_needed()
+    config = BenchConfig.from_toml(bench_config)
+    print("using config file: " + bench_config)
+    benchmarks = list()
+    for specific_config_name in vars(config):
+        specific_config = getattr(config, specific_config_name)
+        if hasattr(specific_config, "active"):
+            if specific_config.active:
+                benchmarks.append(specific_config_name)
 
-    benchmarks = benchmarks.split(",")
     for benchmark_name in benchmarks:
-        benchmark = get_benchmark(benchmark_name)
+        benchmark = get_benchmark(benchmark_name, config)
+        if len(benchmark.tasks) == 0:
+            print(
+                benchmark_name
+                + " was skipped, since no tasks are specified. Increase the number of tasks in the config file at: "
+                + bench_config
+            )
+            continue
         agent = get_agent(path_to_agent)
 
-        results = run(agent, benchmark, task_name, verbose=verbose)
+        results = run(agent, benchmark, verbose=verbose)
         print(
             f"\n--- Results for agent {path_to_agent}, benchmark: {benchmark_name} ---"
         )
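
Note on the new control flow above: main no longer takes a comma-separated benchmark list and a task_name; it loads a TOML file (defaulting to default_bench_config.toml next to the module) and runs every config section whose active flag is true. The selection is plain dataclass introspection via vars()/getattr/hasattr. A minimal, self-contained sketch of that pattern; _Section and _Config are illustrative stand-ins for the real config classes, not part of this PR:

from dataclasses import dataclass, field


@dataclass
class _Section:
    # Stand-in for AppsConfig, MbppConfig, etc.; only the 'active' flag matters here.
    active: bool = True


@dataclass
class _Config:
    # Stand-in for BenchConfig, with one section switched off.
    apps: _Section = field(default_factory=_Section)
    mbpp: _Section = field(default_factory=lambda: _Section(active=False))


config = _Config()
benchmarks = []
for section_name in vars(config):          # iterate over the config's sections by name
    section = getattr(config, section_name)
    if hasattr(section, "active") and section.active:
        benchmarks.append(section_name)

print(benchmarks)  # ['apps'] -- only sections with active = true get run

After this selection, the real code additionally skips any benchmark that ends up with zero tasks and points the user back at the config file path.
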
gpt_engineer/benchmark/bench_config.py

Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from gpt_engineer.core.project_config import read_config
+
+
+@dataclass
+class AppsConfig:
+    active: bool | None = True
+    test_start_index: int | None = 0
+    test_end_index: int | None = 1
+    train_start_index: int | None = 0
+    train_end_index: int | None = 0
+
+
+@dataclass
+class MbppConfig:
+    active: bool | None = True
+    test_len: int | None = 1
+    train_len: int | None = 0
+
+
+@dataclass
+class GptmeConfig:
+    active: bool | None = True
+
+
+@dataclass
+class GptengConfig:
+    active: bool | None = True
+
+
+@dataclass
+class BenchConfig:
+    """Configuration for the GPT Engineer CLI and gptengineer.app via `gpt-engineer.toml`."""
+
+    apps: AppsConfig = field(default_factory=AppsConfig)
+    mbpp: MbppConfig = field(default_factory=MbppConfig)
+    gptme: GptmeConfig = field(default_factory=GptmeConfig)
+    gpteng: GptengConfig = field(default_factory=GptengConfig)
+
+    @classmethod
+    def from_toml(cls, config_file: Path | str):
+        if isinstance(config_file, str):
+            config_file = Path(config_file)
+        config_dict = read_config(config_file)
+        return cls.from_dict(config_dict)
+
+    @classmethod
+    def from_dict(cls, config_dict: dict):
+        return cls(
+            apps=AppsConfig(**config_dict.get("apps", {})),
+            mbpp=MbppConfig(**config_dict.get("mbpp", {})),
+            gptme=GptmeConfig(**config_dict.get("gptme", {})),
+            gpteng=GptengConfig(**config_dict.get("gpteng", {})),
+        )
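
A hedged usage sketch of the new config object, assuming gpt-engineer is installed at this revision so the import below resolves. The key behaviour, visible in from_dict above, is that missing or partial sections fall back to the dataclass defaults:

from gpt_engineer.benchmark.bench_config import AppsConfig, BenchConfig

# Partial input: only override what differs from the defaults.
config = BenchConfig.from_dict({"apps": {"test_end_index": 5}, "gptme": {"active": False}})

assert config.apps == AppsConfig(test_end_index=5)  # other apps fields keep their defaults
assert config.mbpp.test_len == 1                    # untouched section -> all defaults
assert config.gptme.active is False

# BenchConfig.from_toml(<path to a bench config>) gives the same result once
# read_config() has parsed the TOML file into a dict like the one above.
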

gpt_engineer/benchmark/benchmarks/apps/load.py

Lines changed: 17 additions & 15 deletions

@@ -16,8 +16,8 @@
 
 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 
+from gpt_engineer.benchmark.bench_config import AppsConfig
 from gpt_engineer.benchmark.benchmarks.apps.problem import Problem
-from gpt_engineer.benchmark.benchmarks.apps.problems import PROBLEM_IDS
 from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
 from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
 from gpt_engineer.core.files_dict import FilesDict
@@ -57,12 +57,12 @@ def _get_dataset() -> Union[Dataset, DatasetDict]:
         print("Dataset not found locally, downloading...")
 
         dataset = load_dataset("codeparrot/apps", trust_remote_code=True)
-        dataset.save_to_disk(DATASET_PATH)
+        dataset.save_to_disk(str(DATASET_PATH))
 
     return dataset
 
 
-def load_apps():
+def load_apps(config: AppsConfig) -> Benchmark:
     """
     Loads the APPS benchmark, which consists of a series coding problems.
 
@@ -73,17 +73,19 @@ def load_apps():
     """
     dataset = _get_dataset()
     tasks = []
-
-    problems = [
-        Problem(
-            id=problem["problem_id"],
-            question=problem["question"],
-            input_output=problem["input_output"],
-            starter_code=problem["starter_code"],
-        )
-        for problem in dataset["test"]
-        if problem["problem_id"] in PROBLEM_IDS
-    ]
+    problems = list()
+    for dataset_type in ["test", "train"]:
+        problems += [
+            Problem(
+                id=problem["problem_id"],
+                question=problem["question"],
+                input_output=problem["input_output"],
+                starter_code=problem["starter_code"],
+            )
+            for index, problem in enumerate(dataset[dataset_type])
+            if (index < config.__getattribute__(dataset_type + "_end_index"))
+            and (index >= config.__getattribute__(dataset_type + "_start_index"))
+        ]
 
     for problem in problems:
         prompt = Prompt(
@@ -110,6 +112,6 @@ def load_apps():
         )
 
     return Benchmark(
-        name="APPS",
+        name="apps",
         tasks=tasks,
     )
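
The rewritten comprehension windows each dataset split by position using the config's *_start_index / *_end_index fields; config.__getattribute__(name) is just a more verbose spelling of getattr(config, name). A self-contained sketch of the same pattern, with a stand-in config class and plain lists in place of the HuggingFace dataset (window values chosen to mirror the shipped default config):

from dataclasses import dataclass


@dataclass
class _AppsConfig:
    # Stand-in mirroring the fields load_apps reads above.
    test_start_index: int = 0
    test_end_index: int = 2
    train_start_index: int = 0
    train_end_index: int = 2


def _window(rows, dataset_type, config):
    """Keep rows whose position falls in the configured half-open [start, end) window."""
    start = getattr(config, dataset_type + "_start_index")
    end = getattr(config, dataset_type + "_end_index")
    return [row for index, row in enumerate(rows) if start <= index < end]


config = _AppsConfig()
print(_window(["p0", "p1", "p2", "p3"], "test", config))   # ['p0', 'p1']
print(_window(["p0", "p1", "p2"], "train", config))        # ['p0', 'p1']
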

gpt_engineer/benchmark/benchmarks/gpteng/load.py

Lines changed: 4 additions & 2 deletions

@@ -19,11 +19,13 @@
 
 from pathlib import Path
 
+from gpt_engineer.benchmark.bench_config import GptengConfig
 from gpt_engineer.benchmark.benchmarks.gpteng.eval_tools import (
     check_evaluation_component,
 )
 from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
 from gpt_engineer.core.chat_to_files import chat_to_files_dict
+from gpt_engineer.core.prompt import Prompt
 
 evaluations = [
     {
@@ -192,7 +194,7 @@ def eval_to_task(case):
     return Task(
         name=case["name"],
         initial_code=chat_to_files_dict(Path(case["code_blob"]).read_text()),
-        prompt=prompt,
+        prompt=Prompt(prompt),
         command=None,
         assertions={
             f"{e['type']}_{i}": expect_to_assertion(e)
@@ -201,7 +203,7 @@ def eval_to_task(case):
     )
 
 
-def load_gpteng():
+def load_gpteng(config: GptengConfig) -> Benchmark:
     """
     Loads the GPT-Eng benchmark, which consists of a series of tasks for evaluation.

gpt_engineer/benchmark/benchmarks/gptme/load.py

Lines changed: 2 additions & 1 deletion

@@ -10,12 +10,13 @@
 load_gptme : function
     Loads the GPT-Me benchmark, which consists of a series of tasks for evaluation.
 """
+from gpt_engineer.benchmark.bench_config import GptmeConfig
 from gpt_engineer.benchmark.types import Benchmark, Task
 from gpt_engineer.core.files_dict import FilesDict
 from gpt_engineer.core.prompt import Prompt
 
 
-def load_gptme():
+def load_gptme(config: GptmeConfig) -> Benchmark:
     """
     Loads the GPT-Me benchmark, which consists of a series of tasks for evaluation.

gpt_engineer/benchmark/benchmarks/load.py

Lines changed: 5 additions & 2 deletions

@@ -9,6 +9,7 @@
 get_benchmark : function
     Retrieves a Benchmark object by name. Raises ValueError if the benchmark is unknown.
 """
+from gpt_engineer.benchmark.bench_config import BenchConfig
 from gpt_engineer.benchmark.benchmarks.apps.load import load_apps
 from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng
 from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme
@@ -23,14 +24,16 @@
 }
 
 
-def get_benchmark(name: str) -> Benchmark:
+def get_benchmark(name: str, config: BenchConfig) -> Benchmark:
     """
     Retrieves a Benchmark object by name. Raises ValueError if the benchmark is unknown.
 
     Parameters
     ----------
     name : str
         The name of the benchmark to retrieve.
+    config : BenchConfig
+        Configuration object for the benchmarks.
 
     Returns
     -------
@@ -44,4 +47,4 @@ def get_benchmark(name: str) -> Benchmark:
     """
     if name not in BENCHMARKS:
         raise ValueError(f"Unknown benchmark {name}.")
-    return BENCHMARKS[name]()
+    return BENCHMARKS[name](config.__getattribute__(name))
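
With this change each loader receives only its own config section instead of the whole BenchConfig. A minimal sketch of the dispatch pattern with hypothetical stub loaders (not the real load_apps / load_mbpp):

from types import SimpleNamespace


def _load_apps(section):
    # Hypothetical stub; the real loader builds a Benchmark from the APPS dataset.
    return f"apps benchmark built from {section!r}"


def _load_mbpp(section):
    # Hypothetical stub standing in for the real load_mbpp.
    return f"mbpp benchmark built from {section!r}"


BENCHMARKS = {"apps": _load_apps, "mbpp": _load_mbpp}


def get_benchmark(name, config):
    if name not in BENCHMARKS:
        raise ValueError(f"Unknown benchmark {name}.")
    # Hand each loader only its matching section, e.g. config.apps for "apps".
    return BENCHMARKS[name](getattr(config, name))


config = SimpleNamespace(apps={"active": True}, mbpp={"active": True})
print(get_benchmark("apps", config))  # apps benchmark built from {'active': True}
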

gpt_engineer/benchmark/benchmarks/mbpp/load.py

Lines changed: 18 additions & 17 deletions

@@ -16,8 +16,8 @@
 
 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 
+from gpt_engineer.benchmark.bench_config import MbppConfig
 from gpt_engineer.benchmark.benchmarks.mbpp.problem import Problem
-from gpt_engineer.benchmark.benchmarks.mbpp.problems import PROBLEM_IDS
 from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
 from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
 from gpt_engineer.core.files_dict import FilesDict
@@ -57,12 +57,12 @@ def _get_dataset() -> Union[Dataset, DatasetDict]:
         print("Dataset not found locally, downloading...")
 
         dataset = load_dataset("mbpp", "sanitized", trust_remote_code=True)
-        dataset.save_to_disk(DATASET_PATH)
+        dataset.save_to_disk(str(DATASET_PATH))
 
     return dataset
 
 
-def load_mbpp():
+def load_mbpp(config: MbppConfig) -> Benchmark:
     """
     Loads the MBPP benchmark, which consists of a series coding problems.
 
@@ -73,19 +73,20 @@ def load_mbpp():
     """
     dataset = _get_dataset()
     tasks = []
-
-    problems = [
-        Problem(
-            source_file=problem["source_file"],
-            task_id=problem["task_id"],
-            prompt=problem["prompt"],
-            code=problem["code"],
-            test_imports=problem["test_imports"],
-            test_list=problem["test_list"],
-        )
-        for problem in dataset["test"]
-        if problem["task_id"] in PROBLEM_IDS
-    ]
+    problems = []
+    for dataset_type in ["test", "train"]:
+        problems += [
+            Problem(
+                source_file=problem["source_file"],
+                task_id=problem["task_id"],
+                prompt=problem["prompt"],
+                code=problem["code"],
+                test_imports=problem["test_imports"],
+                test_list=problem["test_list"],
+            )
+            for index, problem in enumerate(dataset[dataset_type])
+            if index < config.__getattribute__(dataset_type + "_len")
+        ]
 
     for problem in problems:
         prompt = Prompt(
@@ -109,6 +110,6 @@ def load_mbpp():
        )
 
     return Benchmark(
-        name="MBPP",
+        name="mbpp",
        tasks=tasks,
    )
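
Unlike APPS, MBPP is trimmed purely by length: keeping rows with index < test_len (or train_len) is just "take the first N". An equivalent, self-contained sketch using itertools.islice, with plain lists and a dict standing in for the dataset splits and MbppConfig:

from itertools import islice

# Stand-ins for the MBPP dataset splits and for MbppConfig.
splits = {"test": ["t0", "t1", "t2", "t3"], "train": ["r0", "r1"]}
config = {"test_len": 2, "train_len": 0}

problems = []
for dataset_type in ["test", "train"]:
    # Same effect as the "index < *_len" filter above: keep only the first N rows.
    problems += list(islice(splits[dataset_type], config[dataset_type + "_len"]))

print(problems)  # ['t0', 't1']
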
gpt_engineer/benchmark/default_bench_config.toml

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+# For apps, the maximal range is 0:5000 for both train and test
+[apps]
+active = true
+test_start_index = 0
+test_end_index = 2
+train_start_index = 0
+train_end_index = 2
+
+# For mbpp, the maximal range is 0:47
+[mbpp]
+active = true
+test_len = 2
+train_len = 2
+
+[gpteng]
+active = true
+
+[gptme]
+active = true

gpt_engineer/benchmark/run.py

Lines changed: 1 addition & 4 deletions

@@ -14,7 +14,7 @@
 """
 import time
 
-from typing import List, Optional
+from typing import List
 
 from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult
 from gpt_engineer.core.base_agent import BaseAgent
@@ -24,7 +24,6 @@
 def run(
     agent: BaseAgent,
     benchmark: Benchmark,
-    task_name: Optional[str] = None,
     verbose=False,
 ) -> List[TaskResult]:
     """
@@ -36,8 +35,6 @@ def run(
         The agent to use for running the benchmark tasks.
     benchmark : Benchmark
         The benchmark containing the tasks to run.
-    task_name : Optional[str], default=None
-        An optional name of a specific task to run within the benchmark.
     verbose : bool, default=False
         A flag to indicate whether to print verbose output during the benchmark.