ENH improve validation of required runtime fields after resolving benchmark defaults

dantegd · dantegd · commit 1b5625cb8939 · 2026-04-22T16:02:02.000-05:00
diff --git a/python/cuml/cuml/benchmark/config.py b/python/cuml/cuml/benchmark/config.py
@@ -457,9 +457,30 @@ def _validate_post_defaults_entry(entry: dict[str, Any]) -> None:
         entry, context=f"benchmark '{entry.get('id', entry['algorithm'])}'", require_algorithm=True
     )
 
+    benchmark_name = entry.get("id", entry["algorithm"])
+
+    for field in ("dataset", "input_type", "dtype"):
+        if not isinstance(entry.get(field), str) or not entry[field]:
+            raise BenchmarkConfigError(
+                f"Benchmark '{benchmark_name}' must define a non-empty "
+                f"'{field}' after applying defaults"
+            )
+
+    if not isinstance(entry.get("n_reps"), int):
+        raise BenchmarkConfigError(
+            f"Benchmark '{benchmark_name}' must define integer 'n_reps' "
+            "after applying defaults"
+        )
+
+    if not isinstance(entry.get("test_split"), (int, float)):
+        raise BenchmarkConfigError(
+            f"Benchmark '{benchmark_name}' must define numeric 'test_split' "
+            "after applying defaults"
+        )
+
     if not entry.get("run_cpu", True) and not entry.get("run_gpu", True):
         raise BenchmarkConfigError(
-            f"Benchmark '{entry.get('id', entry['algorithm'])}' cannot "
+            f"Benchmark '{benchmark_name}' cannot "
             "disable both CPU and GPU execution"
         )
 
diff --git a/python/cuml/tests/test_benchmark_config.py b/python/cuml/tests/test_benchmark_config.py
@@ -7,8 +7,9 @@
 from pathlib import Path
 
 import pandas as pd
+import pytest
 
-from cuml.benchmark.config import load_and_resolve_config
+from cuml.benchmark.config import BenchmarkConfigError, load_and_resolve_config
 from cuml.benchmark.run_benchmarks import _run_config_benchmarks, main
 
 
@@ -152,6 +153,57 @@ def test_load_and_resolve_config_expands_shape_pairs_and_param_grid(tmp_path):
     assert entry["param_override_list"] == [{"C": 0.25}, {"C": 1.0}]
 
 
+@pytest.mark.parametrize(  # missing required runtime fields must be rejected
+    ("defaults_block", "expected_field"),
+    [  # each case adds one more default; the next field is expected to fail
+        ("", "dataset"),
+        ("  dataset: classification\n", "input_type"),
+        ("  dataset: classification\n  input_type: numpy\n", "dtype"),
+        (
+            "  dataset: classification\n  input_type: numpy\n  dtype: fp32\n",
+            "n_reps",
+        ),
+        (
+            "  dataset: classification\n"
+            "  input_type: numpy\n"
+            "  dtype: fp32\n"
+            "  n_reps: 2\n",
+            "test_split",
+        ),
+    ],
+)
+def test_load_and_resolve_config_requires_runtime_fields_after_defaults(
+    tmp_path, defaults_block, expected_field
+):
+    config_path = tmp_path / "missing-required.yaml"
+    config_path.write_text(  # minimal config; only the defaults block varies
+        (
+            "version: 1\n\n"
+            "suite:\n"
+            "  name: missing-required\n"
+            "  tier: test\n"
+            "  description: missing field coverage\n\n"
+            "defaults:\n"
+            f"{defaults_block}"  # spliced in under 'defaults:' (may be empty)
+            "  run_cpu: true\n"
+            "  run_gpu: false\n\n"
+            "benchmarks:\n"
+            "  - id: shaped_logreg\n"
+            "    algorithm: LogisticRegression\n"
+            "    operation: fit\n"
+            "    rows: [100]\n"
+            "    features: [8]\n"
+        ),
+        encoding="utf-8",
+    )
+
+    with pytest.raises(  # the error message must name the missing field
+        BenchmarkConfigError,
+        match=rf"must define .*'{expected_field}'.*after applying defaults",
+    ):
+        load_and_resolve_config(str(config_path))
+
+
 def test_run_config_benchmarks_uses_shape_pairs_without_cartesian_product(
     monkeypatch,
 ):