@@ -10,6 +10,9 @@
from metrics_computation_engine.entities.models.session import SessionEntity
from metrics_computation_engine.entities.models.span import SpanEntity
from metrics_computation_engine.types import AggregationLevel
from metrics_computation_engine.entities.core.agent_role_detector import (
get_agent_role_and_skip_decision,
)

from .metric_configuration import MetricConfiguration, build_metric_configuration_map
from .model_loader import MODEL_PROVIDER_NAME, load_model
@@ -26,7 +29,7 @@ class DeepEvalMetricAdapter(BaseMetric):
Adapter to integrate DeepEval metrics as 3rd party plugins into the MCE.
"""

def __init__(self, deepeval_metric_name: str):
def __init__(self, deepeval_metric_name: str, filter_coordinators: bool = True):
super().__init__()
metric_configuration_map: Dict[str, MetricConfiguration] = (
build_metric_configuration_map()
@@ -40,6 +43,7 @@ def __init__(self, deepeval_metric_name: str):
self.name = deepeval_metric_name
self.deepeval_metric = None
self.model = None
self.filter_coordinators = filter_coordinators
metric_configuration: MetricConfiguration = metric_configuration_map[
deepeval_metric_name
]
@@ -85,6 +89,9 @@ def init_with_model(self, model: Any) -> bool:
def create_model(self, llm_config: LLMJudgeConfig) -> Any:
return load_model(llm_config)

def supports_agent_computation(self) -> bool:
return True

@property
def required_parameters(self):
"""Map DeepEval required params to your framework's format"""
@@ -259,3 +266,96 @@ async def compute(
success=False,
error_message=str(e),
)

async def compute_agent_level(
self, data: SpanEntity | SessionEntity
) -> List[MetricResult]:
"""
Compute agent-level metrics for a given session.
"""
results: List[MetricResult] = []
if not data.agent_stats:
return results

session_id = [data.session_id]
app_name = data.spans[0].app_name
category = "agent"

for agent_name in data.agent_stats.keys():
try:
# Check if agent should be skipped based on role detection
should_skip, role_metadata = get_agent_role_and_skip_decision(
data, agent_name, filter_coordinators=self.filter_coordinators
)

if should_skip:
# Skip this agent entirely - don't include in results
# Log the skip for debugging purposes
logger.info(
f"Skipping agent '{agent_name}' for DeepEvalMetric metric: {role_metadata.get('skip_reason', 'Detected as coordinator agent')}"
)
continue

test_case_calculator = self.metric_configuration.test_case_calculator
test_case = test_case_calculator.calculate_test_case_with_agent(
data=data, agent_name=agent_name
)

# Use the async version if available, otherwise fall back to sync
if hasattr(self.deepeval_metric, "a_measure"):
score = await self.deepeval_metric.a_measure(test_case)
else:
score = self.deepeval_metric.measure(test_case)

# Extract additional metadata from the metric
metadata = {
"threshold": getattr(self.deepeval_metric, "threshold", None),
"success": getattr(self.deepeval_metric, "success", None),
"reason": getattr(self.deepeval_metric, "reason", None),
"evaluation_cost": getattr(
self.deepeval_metric, "evaluation_cost", None
),
}

logger.info(f"metadata: {metadata}")
# Filter out None values
metadata = {k: v for k, v in metadata.items() if v is not None}

agent_spans = data._get_spans_for_agent(agent_name)
agent_span_ids = [span.span_id for span in agent_spans]

logger.info(f"aggregation level: {self.aggregation_level}")
results.append(
MetricResult(
metric_name=self.name,
description="",
value=score,
reasoning=metadata.get("reason"),
unit="",
aggregation_level=self.aggregation_level,
category=category,
app_name=app_name,
agent_id=agent_name,
span_id=agent_span_ids,
session_id=session_id,
source="deepeval",
entities_involved=[],
edges_involved=[],
success=getattr(
self.deepeval_metric, "success", score is not None
),
metadata=metadata,
error_message=None,
)
)

except Exception as e:
# Log the failure (with traceback) and continue with the remaining agents
logger.exception(
f"DeepEval computation failed for agent '{agent_name}': {e}"
)
return results
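
For orientation, here is a minimal driver sketch for the agent-level path added above. It only uses calls visible in this diff (DeepEvalMetricAdapter, create_model, init_with_model, compute_agent_level); the session object, the llm_config, and the import paths are assumptions, not part of the PR.

# Hypothetical driver; import paths for the adapter and the judge config are
# omitted because they are not shown in this diff.
import asyncio

async def score_agents(session, llm_config):
    # "TurnRelevancyMetric" is registered in the configuration map by this PR
    adapter = DeepEvalMetricAdapter("TurnRelevancyMetric", filter_coordinators=True)
    model = adapter.create_model(llm_config)
    if not adapter.init_with_model(model):
        raise RuntimeError("judge model could not be initialised for this metric")
    # One MetricResult per agent that is not skipped as a coordinator
    return await adapter.compute_agent_level(session)

# results = asyncio.run(score_agents(session, llm_config))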
@@ -11,6 +11,7 @@
from deepeval.metrics import (
BiasMetric,
ConversationCompletenessMetric,
TurnRelevancyMetric,
GEval,
RoleAdherenceMetric,
TaskCompletionMetric,
@@ -102,6 +103,16 @@ def build_metric_configurations() -> List[MetricConfiguration]:
),
metric_class=ConversationCompletenessMetric,
),
MetricConfiguration(
metric_name=TurnRelevancyMetric.__name__,
test_case_calculator=DeepEvalTestCaseConversational(),
requirements=MetricRequirements(
entity_type=["llm"],
aggregation_level="session",
required_input_parameters=["conversation_elements"],
),
metric_class=TurnRelevancyMetric,
),
MetricConfiguration(
metric_name=BiasMetric.__name__,
test_case_calculator=DeepEvalTestCaseLLM(),
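
A small sketch of how the new TurnRelevancyMetric entry would be resolved at runtime; only the configuration fields and build_metric_configuration_map appear in this diff, so the attribute access below is an assumption based on the constructor arguments.

from deepeval.metrics import TurnRelevancyMetric

# Assumed lookup pattern, mirroring how the adapter resolves metrics by class name
config_map = build_metric_configuration_map()
turn_relevancy = config_map[TurnRelevancyMetric.__name__]  # key is "TurnRelevancyMetric"

# Session-level, conversational test cases built from "conversation_elements"
calculator = turn_relevancy.test_case_calculator
requirements = turn_relevancy.requirements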
@@ -69,7 +69,6 @@ def calculate_test_case(
output=tool_call.output,
)
)

return LLMTestCase(
input=user_input, actual_output=final_response, tools_called=tools_called
)
@@ -96,8 +95,43 @@ def calculate_test_case(
role = "assistant"
turns.append(Turn(role=role, content=element.content))

print("LLM Conversation Turns:", turns) # Debugging line
return ConversationalTestCase(chatbot_role=chatbot_role, turns=turns)

def calculate_test_case_with_agent(
self, data: Union[SpanEntity, SessionEntity], agent_name: str
) -> Union[ConversationalTestCase, LLMTestCase]:
"""
Create conversational test case from SessionEntity data for a given agent.
"""
data: SessionEntity = _make_sure_input_is_session_entity(data=data)
agent_role = "assistant"
# _, agent_role_metadata = get_agent_role_and_skip_decision(data, agent_name)
# if agent_role_metadata:
# agent_role = agent_role_metadata.detected_role or "assistant"
# if agent_name == "unknown":
# agent_role = "assistant"

# print(f"Agent Role for {agent_name}: {agent_role}")
agent_conversation = data.get_agent_conversation_data(agent_name)
if not agent_conversation:
raise ValueError(f"No conversation elements found for agent {agent_name}")
turns = []
for element in agent_conversation.get("elements", []):
role = element.get("role", "assistant")
if not isinstance(role, str):
continue
# Coerce any role other than "user"/"assistant" (e.g. "system") to "assistant"
if role not in ["user", "assistant"]:
role = "assistant"
turns.append(Turn(role=role, content=str(element.get("content", ""))))
return ConversationalTestCase(chatbot_role=agent_role, turns=turns)


class LLMAnswerRelevancyTestCase(AbstractTestCaseCalculator):
def calculate_test_case(
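
A toy illustration of the role handling in calculate_test_case_with_agent above: non-string roles are dropped, and anything other than "user"/"assistant" is folded into the assistant side. The element dicts here are invented for the example.

elements = [
    {"role": "user", "content": "What is the refund policy?"},
    {"role": "system", "content": "You are a support agent."},
    {"role": None, "content": "dropped: role is not a string"},
    {"role": "tool", "content": "refund_policy.md contents"},
]

turns = []
for element in elements:
    role = element.get("role", "assistant")
    if not isinstance(role, str):
        continue  # element is skipped entirely
    if role not in ["user", "assistant"]:
        role = "assistant"
    turns.append((role, str(element.get("content", ""))))  # plain tuples stand in for deepeval Turn objects

# turns -> [("user", ...), ("assistant", ...), ("assistant", ...)]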
@@ -133,7 +133,7 @@ async def compute_agent_level(self, session: SessionEntity) -> List[MetricResult

result = self._create_error_result(
error_message=f"Agent '{agent_name}' missing input_query or final_response data",
category="application",
category="agent",
description=self.description,
app_name=session.app_name,
entities_involved=entities_involved,
@@ -170,8 +170,9 @@ async def compute_agent_level(self, session: SessionEntity) -> List[MetricResult
score=score,
reasoning=f"{reasoning}",
description=self.description,
category="application",
category="agent",
app_name=session.app_name,
agent_id=agent_name,
entities_involved=entities_involved,
span_ids=agent_span_ids,
session_ids=[session.session_id],
@@ -180,7 +181,7 @@ async def compute_agent_level(self, session: SessionEntity) -> List[MetricResult
result = self._create_error_result(
error_message="No model available",
description=self.description,
category="application",
category="agent",
app_name=session.app_name,
entities_involved=entities_involved,
span_ids=agent_span_ids,
@@ -78,10 +78,15 @@ async def compute(self, session: SessionEntity, **context) -> MetricResult:
conversation = ""
if session.conversation_data:
# Try both "elements" (main branch) and "conversation" (agent branch) formats
if "elements" in session.conversation_data:
conversation = session.conversation_data["elements"]
elif "conversation" in session.conversation_data:
conversation = session.conversation_data["conversation"]
# if "elements" in session.conversation_data:
# conversation = session.conversation_data["elements"]
# elif "conversation" in session.conversation_data:
# conversation = session.conversation_data["conversation"]
conversation = session.get_conversation_data_without_images()
if "elements" in conversation:
conversation = conversation["elements"]
elif "conversation" in conversation:
conversation = conversation["conversation"]

# Format conversation properly (main branch improvement)
conversation_str = (
@@ -194,6 +199,7 @@ async def compute_agent_level(self, session: SessionEntity) -> List[MetricResult
result = self._create_success_result(
score=score,
category="agent",
agent_id=agent_name,
app_name=session.app_name,
reasoning=reasoning,
entities_involved=[agent_name],
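
The image-free conversation extraction above also appears in the metric at the end of this diff; a helper along these lines captures the shared pattern. get_conversation_data_without_images and the two payload keys come from the diff; the helper name and the empty-list fallback are assumptions.

def conversation_elements(session):
    """Return the image-free conversation under either supported payload key."""
    if not session.conversation_data:
        return []
    conversation = session.get_conversation_data_without_images()
    # Payloads use "elements" (main branch) or "conversation" (agent branch)
    if "elements" in conversation:
        return conversation["elements"]
    if "conversation" in conversation:
        return conversation["conversation"]
    return []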
@@ -146,7 +146,7 @@ async def compute_agent_level(self, session: SessionEntity):

result = self._create_error_result(
error_message=f"Agent '{agent_name}' missing input_query or final_response data",
category="application",
category="agent",
description=self.description,
app_name=session.app_name,
entities_involved=entities_involved,
@@ -182,8 +182,9 @@ async def compute_agent_level(self, session: SessionEntity):
score, reasoning = self.jury.judge(prompt, BinaryGrading)
result = self._create_success_result(
score=score,
category="application",
category="agent",
app_name=session.app_name,
agent_id=agent_name,
reasoning=reasoning,
entities_involved=entities_involved,
span_ids=agent_span_ids,
@@ -201,7 +202,7 @@ async def compute_agent_level(self, session: SessionEntity):
else:
result = self._create_error_result(
error_message="No model available",
category="application",
category="agent",
app_name=session.app_name,
entities_involved=entities_involved,
span_ids=agent_span_ids,
@@ -77,11 +77,13 @@ async def compute(self, session: SessionEntity, **context) -> MetricResult:
**context: Additional context
"""
# Session-level computation
conversation = (
session.conversation_data.get("conversation", "")
if session.conversation_data
else ""
)
conversation = ""
if session.conversation_data:
conversation = session.get_conversation_data_without_images()
if "elements" in conversation:
conversation = conversation["elements"]
elif "conversation" in conversation:
conversation = conversation["conversation"]
agent_span_ids = (
[span.span_id for span in session.agent_spans]
if session.agent_spans
@@ -179,6 +181,7 @@ async def compute_agent_level(self, session: SessionEntity) -> List[MetricResult
score=score,
category="agent",
app_name=session.app_name,
agent_id=agent_name,
reasoning=reasoning,
entities_involved=[agent_name],
span_ids=agent_span_ids,