-
Notifications
You must be signed in to change notification settings - Fork 0
Implementation Plan: Add Red-Team Severity Calibration by Experiment Type in review-design #614
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
81f2f28
80b1c6c
a63d879
3e681c2
a6b3cbe
c83938f
21d50bc
292dc3c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -294,3 +294,78 @@ def test_l3_l4_subagents_receive_experiment_type( | |
| f"{step_heading}: L3/L4 subagents must receive experiment_type. " | ||
| "Type-agnostic severity calibration is a structural gap." | ||
| ) | ||
|
|
||
|
|
||
| # ── Red-team severity calibration ───────────────────────────────────────────── | ||
|
|
||
|
|
||
def test_red_team_severity_calibration_rubric_present(skill_text: str) -> None:
    """Red-team dimension must have a severity calibration rubric by experiment type.

    Without this rubric, any critical red-team finding triggers STOP regardless
    of experiment type, creating an unresolvable loop for benchmarks.

    The rubric is located by the first case-insensitive occurrence of
    "red-team severity calibration"; the 1000 characters starting at that
    point must mention every experiment type checked below.

    Args:
        skill_text: Text of SKILL.md to inspect.

    Raises:
        AssertionError: If the rubric heading is missing, or an experiment
            type is not named within the inspected window.
    """
    import re

    # Search the ORIGINAL text. An offset taken from skill_text.lower() cannot
    # be used to slice skill_text safely, because lower-casing may change the
    # length of the string (e.g. "İ" lower-cases to two code points).
    heading = re.search("red-team severity calibration", skill_text, re.IGNORECASE)
    assert heading is not None, (
        "Red-team severity calibration rubric not found in SKILL.md. "
        "Without it, any critical red-team finding triggers STOP regardless "
        "of experiment type."
    )
    rt_section = skill_text[heading.start() : heading.start() + 1000]

    for exp_type in ("causal_inference", "benchmark", "exploratory"):
        assert exp_type in rt_section, (
            f"Red-team calibration rubric must specify {exp_type} severity cap."
        )
|
|
||
|
|
||
def test_red_team_severity_cap_applied_before_verdict(skill_text: str) -> None:
    """Check that Step 7 mentions ``rt_cap`` ahead of the red_team stop trigger.

    The severity cap has to be applied BEFORE stop_triggers is built in the
    verdict logic; otherwise red-team criticals bypass the cap and still
    trigger STOP.
    """
    # NOTE(review): skill_text_between is not defined in this block; it is
    # presumably a helper provided elsewhere in the test module — confirm.
    verdict_section = skill_text_between("### Step 7", "### Step 8", skill_text)

    # First occurrence of each marker; find() yields -1 when a marker is absent.
    cap_pos = verdict_section.find("rt_cap")
    trigger_pos = verdict_section.find('f.dimension == "red_team"')

    assert cap_pos >= 0, (
        "Step 7 verdict logic must reference rt_cap for red-team severity capping."
    )
    assert trigger_pos >= 0, (
        "Step 7 verdict logic must reference red_team dimension in stop_triggers."
    )
    assert trigger_pos > cap_pos, (
        "rt_cap must be applied BEFORE the red_team stop_triggers line — "
        "otherwise the cap has no effect on STOP eligibility."
    )
|
|
||
|
|
||
def _parse_rt_rubric(skill_text: str) -> dict[str, str]:
    """Parse the red-team severity calibration rubric into {experiment_type: severity}.

    The rubric is located by the first case-insensitive occurrence of
    "red-team severity calibration". Within the 1000 characters starting
    there, the first two Markdown table rows that are not delimiter rows are
    taken as the header row and the data row. The first column of both rows
    is treated as a row label and dropped.

    Args:
        skill_text: Text containing the rubric table.

    Returns:
        Mapping from each lower-cased header cell (after the first column) to
        the lower-cased cell in the same position of the data row.

    Raises:
        AssertionError: If the rubric heading is missing, or fewer than two
            table rows are found in the inspected window.
    """
    import re

    # Search the ORIGINAL text. An offset taken from skill_text.lower() cannot
    # be used to slice skill_text safely, because lower-casing may change the
    # length of the string (e.g. "İ" lower-cases to two code points).
    heading = re.search("red-team severity calibration", skill_text, re.IGNORECASE)
    assert heading is not None, "Red-team severity calibration rubric not found"
    rt_section = skill_text[heading.start() : heading.start() + 1000]

    def split_row(line: str) -> list[str]:
        # Remove only the outer pipes and keep empty cells, so every cell
        # stays in its own column (a blank label header must not shift the
        # remaining headers to the left).
        inner = line.strip().removeprefix("|").removesuffix("|")
        return [cell.strip().lower() for cell in inner.split("|")]

    def is_delimiter(cells: list[str]) -> bool:
        # A Markdown delimiter row consists solely of cells like --- or :---:.
        return all(re.fullmatch(r":?-+:?", cell) for cell in cells)

    candidate_rows = [split_row(ln) for ln in rt_section.splitlines() if "|" in ln]
    table_rows = [row for row in candidate_rows if not is_delimiter(row)]
    assert len(table_rows) >= 2, "Rubric must have header + data row"

    headers, values = table_rows[0], table_rows[1]
    return dict(zip(headers[1:], values[1:]))
Trecek marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
def test_benchmark_red_team_cannot_stop(skill_text: str) -> None:
    """Verify the rubric caps benchmark red-team severity at "warning" (no STOP)."""
    severity_by_type = _parse_rt_rubric(skill_text)
    # A missing column and a wrong cap are reported separately.
    benchmark_cap = severity_by_type.get("benchmark")
    assert benchmark_cap is not None, "Benchmark column not found in red-team calibration rubric"
    assert benchmark_cap == "warning", (
        "Benchmark red-team severity must be capped at 'warning' — "
        "STOP-eligible red-team findings are unreasonable for benchmarks."
    )
|
|
||
|
|
||
def test_causal_inference_red_team_can_stop(skill_text: str) -> None:
    """Verify causal_inference keeps "critical" as its red-team ceiling (STOP eligible)."""
    severity_by_type = _parse_rt_rubric(skill_text)
    # A missing column and a wrong cap are reported separately.
    causal_cap = severity_by_type.get("causal_inference")
    assert causal_cap is not None, (
        "causal_inference column not found in red-team calibration rubric"
    )
    assert causal_cap == "critical", (
        "causal_inference must retain critical as max red-team severity."
    )
Uh oh!
There was an error while loading. Please reload this page.