Skip to content

Commit 54c8098

Browse files
innoscoutpro and claude committed
feat: LLMJudge with completeness rubric + 6-strategy JSON parser
Addresses upstream issue NousResearch#33 (C1: keyword-only metric) and forward-ports the more polished pieces of upstream PR NousResearch#25 and PR NousResearch#39 partial. evolution/core/fitness.py - Replace conciseness dimension with completeness — judges should penalise omissions, not reward brevity. Composite weight now 0.4 correctness + 0.3 procedure + 0.3 completeness. - New init_fitness_metric(config, skill_text, use_llm_judge=True) / reset_fitness_metric() pair. When use_llm_judge=True, an LLMJudge with the completeness rubric is the primary scorer; the deterministic multi-signal scorer becomes the fallback. When False (default), the metric stays purely deterministic and zero-cost — appropriate for fast iteration and for runs the user doesn't want to send to a judge. - skill_fitness_metric accepts the 5-arg GEPA signature (gold, pred, trace, pred_name, pred_trace) so it works with both GEPA and the legacy 3-arg metric API. - Judge failures fall through to deterministic with a "[judge unavailable: <ExceptionClass>]" prefix in feedback so users can see why scores look heuristic mid-run. evolution/core/dataset_builder.py - Replace inline 3-strategy JSON recovery with a 6-strategy _try_parse_json_list helper: (1) direct json, (2) ast.literal_eval (safer than eval, but parses Python-literal single-quoted dicts), (3) array extraction followed by json and then ast.literal_eval on the extracted candidate, (4) trailing-comma-and-quote-fix, (5) markdown-fence stripping, and (6) a last-resort per-block scan. Returns None instead of raising so the caller can produce a useful error. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 359ccca commit 54c8098

2 files changed

Lines changed: 214 additions & 148 deletions

File tree

evolution/core/dataset_builder.py

Lines changed: 88 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
C) Golden sets — hand-curated JSONL files
77
"""
88

9+
import ast
910
import json
1011
import random
12+
import re
1113
from pathlib import Path
1214
from dataclasses import dataclass, field
1315
from typing import Optional
@@ -17,6 +19,84 @@
1719
from evolution.core.config import EvolutionConfig
1820

1921

22+
def _try_parse_json_list(text: str) -> Optional[list]:
23+
"""Parse a JSON list out of LLM-emitted text using progressively more
24+
aggressive recovery strategies.
25+
26+
LLMs frequently emit malformed JSON: trailing commas, single quotes,
27+
surrounding markdown fences, prose preambles. We try the cheap,
28+
well-defined parses first and only reach for `ast.literal_eval` (which
29+
is safer than `eval` but still parses Python literal syntax) on the
30+
extracted-array candidates. Returns `None` if everything fails.
31+
"""
32+
text = text.strip()
33+
34+
# 1. Direct JSON.
35+
try:
36+
result = json.loads(text)
37+
if isinstance(result, list):
38+
return result
39+
except json.JSONDecodeError:
40+
pass
41+
42+
# 2. ast.literal_eval — handles single-quoted dicts/strings.
43+
try:
44+
result = ast.literal_eval(text)
45+
if isinstance(result, list):
46+
return result
47+
except (ValueError, SyntaxError):
48+
pass
49+
50+
# 3. Extract the first plausible array from surrounding prose.
51+
match = re.search(r"\[\s*\{.*\}\s*\]", text, re.DOTALL)
52+
if match:
53+
candidate = match.group()
54+
try:
55+
result = json.loads(candidate)
56+
if isinstance(result, list):
57+
return result
58+
except json.JSONDecodeError:
59+
pass
60+
try:
61+
result = ast.literal_eval(candidate)
62+
if isinstance(result, list):
63+
return result
64+
except (ValueError, SyntaxError):
65+
pass
66+
67+
# 4. Fix trailing commas + naive single-quote rewrite, then retry.
68+
fixed = re.sub(r",\s*([}\]])", r"\1", text)
69+
fixed = re.sub(r"(?<!\\)'([^']+?)'(?=\s*[:,\]\}])", r'"\1"', fixed)
70+
try:
71+
result = json.loads(fixed)
72+
if isinstance(result, list):
73+
return result
74+
except json.JSONDecodeError:
75+
pass
76+
77+
# 5. Strip markdown code fences.
78+
stripped = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
79+
stripped = re.sub(r"\s*```$", "", stripped)
80+
try:
81+
result = json.loads(stripped)
82+
if isinstance(result, list):
83+
return result
84+
except json.JSONDecodeError:
85+
pass
86+
87+
# 6. Last-ditch: collect every `{...}` block individually.
88+
blocks: list = []
89+
for block_match in re.finditer(r"\{[^{}]*\}", text, re.DOTALL):
90+
try:
91+
blocks.append(json.loads(block_match.group()))
92+
except json.JSONDecodeError:
93+
continue
94+
if blocks:
95+
return blocks
96+
97+
return None
98+
99+
20100
@dataclass
21101
class EvalExample:
22102
"""A single evaluation example."""
@@ -132,56 +212,15 @@ def generate(
132212
num_cases=n,
133213
)
134214

135-
# Parse the generated test cases — LLMs often produce slightly malformed JSON
136-
import re
215+
# Parse the generated test cases. LLMs often produce slightly
216+
# malformed JSON; `_try_parse_json_list` tries six progressively more
217+
# aggressive recovery strategies before giving up.
137218
raw_text = result.test_cases
138-
139-
def _try_parse_json(text: str) -> list:
140-
"""Try multiple strategies to parse LLM JSON output."""
141-
# Strategy 1: Direct JSON parse
142-
try:
143-
return json.loads(text)
144-
except json.JSONDecodeError:
145-
pass
146-
147-
# Strategy 2: Python literal eval — handles single-quoted dicts
148-
try:
149-
import ast
150-
result = ast.literal_eval(text.strip())
151-
if isinstance(result, list):
152-
return result
153-
except (ValueError, SyntaxError):
154-
pass
155-
156-
# Strategy 3: Extract array from surrounding text
157-
match = re.search(r'\[[\s\S]*\]', text)
158-
if match:
159-
candidate = match.group()
160-
try:
161-
return json.loads(candidate)
162-
except json.JSONDecodeError:
163-
pass
164-
165-
# Try literal_eval on extracted array
166-
try:
167-
import ast
168-
result = ast.literal_eval(candidate)
169-
if isinstance(result, list):
170-
return result
171-
except (ValueError, SyntaxError):
172-
pass
173-
174-
# Fix common LLM JSON issues
175-
fixed = re.sub(r',\s*([}\]])', r'\1', candidate)
176-
fixed = re.sub(r"(?<!\\)'([^']+?)'(?=\s*[:,\]\}])", r'"\1"', fixed)
177-
try:
178-
return json.loads(fixed)
179-
except json.JSONDecodeError:
180-
pass
181-
182-
raise ValueError(f"Could not parse test cases from LLM output: {raw_text[:500]}")
183-
184-
cases_raw = _try_parse_json(raw_text)
219+
cases_raw = _try_parse_json_list(raw_text)
220+
if cases_raw is None:
221+
raise ValueError(
222+
f"Could not parse test cases from LLM output: {raw_text[:500]}"
223+
)
185224

186225
examples = [
187226
EvalExample(

0 commit comments

Comments
 (0)