Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
from .agent_bad_case_signal_mapper import AgentBadCaseSignalMapper
from .agent_cross_model_pair_mapper import AgentCrossModelPairMapper
from .agent_dialog_normalize_mapper import AgentDialogNormalizeMapper
from .agent_distill_trajectory_mapper import AgentDistillTrajectoryMapper
from .agent_error_taxonomy_mapper import AgentErrorTaxonomyMapper
from .agent_harness_noise_mapper import AgentHarnessNoiseMapper
from .agent_insight_llm_mapper import AgentInsightLLMMapper
from .agent_learnable_value_scorer import AgentLearnableValueScorer
from .agent_rewrite_hint_mapper import AgentRewriteHintMapper
from .agent_safety_gate_mapper import AgentSafetyGateMapper
from .agent_skill_insight_mapper import AgentSkillInsightMapper
from .agent_sys_log_noise_mapper import AgentSysLogNoiseMapper
from .agent_tool_relevance_mapper import AgentToolRelevanceMapper
from .agent_tool_type_mapper import AgentToolTypeMapper
from .agent_trace_coherence_mapper import AgentTraceCoherenceMapper
from .agent_training_card_mapper import AgentTrainingCardMapper
from .annotation.human_preference_annotation_mapper import (
HumanPreferenceAnnotationMapper,
)
Expand Down Expand Up @@ -135,7 +144,16 @@

__all__ = [
"AgentBadCaseSignalMapper",
"AgentCrossModelPairMapper",
"AgentDialogNormalizeMapper",
"AgentDistillTrajectoryMapper",
"AgentErrorTaxonomyMapper",
"AgentHarnessNoiseMapper",
"AgentLearnableValueScorer",
"AgentRewriteHintMapper",
"AgentSafetyGateMapper",
"AgentSysLogNoiseMapper",
"AgentTrainingCardMapper",
"AgentInsightLLMMapper",
"AgentSkillInsightMapper",
"AgentToolTypeMapper",
Expand Down
32 changes: 31 additions & 1 deletion data_juicer/ops/mapper/agent_bad_case_signal_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@
_calibration_missing_path_warned: Optional[str] = None


def _sys_log_or_harness_noise(meta: dict) -> bool:
    """Report whether either noise annotation in ``meta`` flags the sample.

    Looks at the entries stored under ``MetaKeys.agent_sys_log_noise`` and
    ``MetaKeys.agent_harness_noise``. An entry counts only when it is a dict
    whose ``"is_likely_noise"`` value is truthy; missing, falsy, or non-dict
    entries are ignored.

    Args:
        meta: Mapping that holds the per-sample meta fields.

    Returns:
        True if at least one of the two entries marks likely noise,
        otherwise False.
    """
    # Fetch both entries before checking anything; a falsy entry is treated
    # as an empty dict so the lookup below simply finds no flag.
    noise_entries = (
        meta.get(MetaKeys.agent_sys_log_noise) or {},
        meta.get(MetaKeys.agent_harness_noise) or {},
    )
    return any(
        isinstance(entry, dict) and entry.get("is_likely_noise")
        for entry in noise_entries
    )


def _load_calibration_json(path: str) -> Optional[Dict[str, Any]]:
if not path or not str(path).strip():
return None
Expand Down Expand Up @@ -178,6 +188,9 @@ def __init__(
signal_on_low_dialog_quality_meta: bool = True,
dialog_quality_low_score_threshold: float = 2.0,
min_dialog_quality_low_axes_for_signal: int = 1,
# --- training-dataset gating / noise (optional; keeps legacy audit tiers unchanged) ---
exclude_if_sys_log_or_harness_noise: bool = False,
signal_on_learnable_value_tier: bool = False,
**kwargs,
):
super().__init__(**kwargs)
Expand Down Expand Up @@ -235,6 +248,8 @@ def __init__(
self.signal_on_low_dialog_quality_meta = bool(signal_on_low_dialog_quality_meta)
self.dialog_quality_low_score_threshold = float(dialog_quality_low_score_threshold)
self.min_dialog_quality_low_axes_for_signal = max(1, int(min_dialog_quality_low_axes_for_signal))
self.exclude_if_sys_log_or_harness_noise = bool(exclude_if_sys_log_or_harness_noise)
self.signal_on_learnable_value_tier = bool(signal_on_learnable_value_tier)

def _resolve_calibration_row(self, meta: dict) -> Dict[str, Any]:
if not self.auto_calibrate_thresholds or not self._calibration:
Expand Down Expand Up @@ -364,11 +379,14 @@ def process_single(self, sample: dict) -> dict:

fail_count = int(meta.get(MetaKeys.tool_fail_count) or 0)
if self.signal_on_tool_fail and fail_count >= self.min_tool_fail_count_for_signal:
tw = "high"
if self.exclude_if_sys_log_or_harness_noise and _sys_log_or_harness_noise(meta):
tw = "medium"
self._append(
signals,
"tool_message_error_pattern",
f"tool_fail_count={fail_count}",
"high",
tw,
)

succ = int(meta.get(MetaKeys.tool_success_count) or 0)
Expand All @@ -378,6 +396,7 @@ def process_single(self, sample: dict) -> dict:
self.signal_on_low_tool_success_ratio
and rounds >= self.min_tool_rounds_for_ratio_signal
and ratio is not None
and float(ratio) >= 0.0
and float(ratio) <= self.tool_success_ratio_max_for_signal
):
self._append(
Expand Down Expand Up @@ -510,6 +529,17 @@ def process_single(self, sample: dict) -> dict:
"medium",
)

if self.signal_on_learnable_value_tier:
tier = meta.get(MetaKeys.agent_learnable_value_tier)
lv = meta.get(MetaKeys.agent_learnable_value)
if isinstance(tier, str) and tier.strip():
self._append(
signals,
f"learnable_value_tier_{tier.strip()}",
f"tier={tier},learnable_value={lv}",
"medium",
)

meta[MetaKeys.agent_bad_case_signals] = signals

mediums = [s for s in signals if s.get("weight") == "medium"]
Expand Down
Loading
Loading