Commit 48a4c94

Merge pull request #1946 from Giskard-AI/gsk-3579-add-conversation-correctness-evaluation
Add conversation correctness evaluation
2 parents: 6cdde2f + 7f7f608

File tree: 2 files changed (+48 −35 lines)

giskard/rag/metrics/correctness.py

Lines changed: 46 additions & 33 deletions
@@ -6,49 +6,58 @@
 from ..question_generators.utils import parse_json_output
 from .base import Metric
 
-CORRECTNESS_EVALUATION_SYSTEM_PROMPT = """Your role is to test AI agents. Your task consists in assessing whether a agent output correctly answers a question.
-You are provided with the ground truth answer to the question. Your task is then to evaluate if the agent answer is close to the ground thruth answer.
 
-You are auditing the following agent:
-{agent_description}
+def format_conversation(conversation: list[dict]):
+    return "\n\n".join([f"<{msg['role'].lower()}>{msg['content']}</{msg['role'].lower()}>" for msg in conversation])
 
-Think step by step and consider the agent output in its entirety. Remember: you need to have a strong and sound reason to support your evaluation.
-If the agent answer is correct, return True. If the agent answer is incorrect, return False along with the reason.
-You must output a single JSON object with keys 'correctness' and 'correctness_reason'. Make sure you return a valid JSON object.
 
-The question that was asked to the agent, its output, and the expected ground truth answer will be delimited with XML tags.
+CORRECTNESS_EVALUATION_SYSTEM_PROMPT = """Your role is to test AI assistants. Your task consists in assessing whether an Agent correctly answered a question.
+
+The user will provide a description of the agent, a conversation between the Agent (<assistant>) and the user (<user>), the agent answer to the last question of the conversation and the reference (true) answer. You must tell if the Agent correctly answered the question, comparing it to the reference.
+Be sure to take the conversation history into account.
+
+If the Agent answer is correct, you will output a JSON object with "eval_passed" equal true, like this:
+{"correctness" : true}
+If the Agent answer is wrong, you will return "eval_passed" equal false and provide a reason:
+{"correctness": false, "correctness_reason": "The agent stated that X but should have said Y"}
 """
 
-CORRECTNESS_INPUT_TEMPLATE = """<question>
-{question}
-</question>
+CORRECTNESS_INPUT_TEMPLATE = """
+### AGENT DESCRIPTION
+{description}
 
-<agent_answer>
-{agent_answer}
-</agent_answer>
+### CONVERSATION
+{conversation}
 
-<ground_truth>
-{ground_truth}
-</ground_truth>
-"""
+### AGENT ANSWER
+{answer}
 
+### REFERENCE ANSWER
+{reference_answer}
+"""
 
 CORRECTNESS_TRUE_EXAMPLE_INPUT = CORRECTNESS_INPUT_TEMPLATE.format(
-    question="What is the capital of France?", agent_answer="The capital of France is Paris.", ground_truth="Paris."
+    description="A chatbot for an ecommerce website, helping users to track their orders and solve issues",
+    conversation="<user>Which countries do you ship to?</user>",
+    answer="We ship our products across all United States.",
+    reference_answer="We ship our products to the United States, Canada, and Mexico.",
 )
 
-CORRECTNESS_TRUE_EXAMPLE_OUTPUT = """{"correctness": true, "correctness_reason": ""}"""
+CORRECTNESS_TRUE_EXAMPLE_OUTPUT = """{"correctness": false, "correctness_reason": "The agent stated that they ship to United States, but should have included Canada and Mexico."}"""
 
-CORRECTNESS_FALSE_EXAMPLE_INPUT = CORRECTNESS_INPUT_TEMPLATE.format(
-    question="What is the capital of Denmark?",
-    agent_answer="The capital of Denmark is Paris.",
-    ground_truth="Copenhagen.",
-)
+CORRECTNESS_TRUE_EXAMPLE_INPUT_CONV = CORRECTNESS_INPUT_TEMPLATE.format(
+    description="An educational chatbot for physics students, helping them with homework and explaining concepts",
+    conversation="""<user>: Hello, I have some trouble understanding the Archimedes' principle.</user>
+
+<assistant>Hi, sure I can help you with that, what do you want to know?</assistant>
 
-CORRECTNESS_FALSE_EXAMPLE_OUTPUT = (
-    """{"correctness": false, "correctness_reason": "The capital of Denmark is Copenhagen, not Paris."}"""
+<user>Where does it act?</user>""",
+    answer="The Archimedes' principle acts at the center of gravity of an object.",
+    reference_answer="The Archimedes' principle acts at the center of buoyancy of an object. It is the center of gravity of the displaced fluid.",
 )
 
+CORRECTNESS_TRUE_EXAMPLE_OUTPUT_CONV = """{"correctness": false, "correctness_reason": "The agent stated that the Archimedes' principle acts at the center of gravity of an object, but should have said that it acts at the center of buoyancy of the object."}"""
+
 
 class CorrectnessMetric(Metric):
     def __init__(self, name: str, llm_client: LLMClient = None, agent_description: Optional[str] = None):
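
The most interesting addition in this hunk is the format_conversation helper: it renders an OpenAI-style message list as XML-ish tagged turns, which is how the conversation history ends up inside the evaluation prompt. A minimal, self-contained sketch of its behavior (the two sample messages are made up for illustration):

def format_conversation(conversation: list[dict]):
    # Wrap each turn in <role>...</role> tags; turns are separated by a blank line.
    return "\n\n".join([f"<{msg['role'].lower()}>{msg['content']}</{msg['role'].lower()}>" for msg in conversation])

history = [
    {"role": "user", "content": "Which countries do you ship to?"},
    {"role": "assistant", "content": "We ship to the United States, Canada, and Mexico."},
]

print(format_conversation(history))
# <user>Which countries do you ship to?</user>
#
# <assistant>We ship to the United States, Canada, and Mexico.</assistant>

The second hunk wires this helper and the reworked few-shot examples into the evaluation call: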
@@ -78,18 +87,22 @@ def __call__(self, question_sample: dict, answer: AgentAnswer) -> dict:
             messages=[
                 ChatMessage(
                     role="system",
-                    content=CORRECTNESS_EVALUATION_SYSTEM_PROMPT.format(agent_description=self.agent_description),
+                    content=CORRECTNESS_EVALUATION_SYSTEM_PROMPT,
                 ),
                 ChatMessage(role="user", content=CORRECTNESS_TRUE_EXAMPLE_INPUT),
                 ChatMessage(role="assistant", content=CORRECTNESS_TRUE_EXAMPLE_OUTPUT),
-                ChatMessage(role="user", content=CORRECTNESS_FALSE_EXAMPLE_INPUT),
-                ChatMessage(role="assistant", content=CORRECTNESS_FALSE_EXAMPLE_OUTPUT),
+                ChatMessage(role="user", content=CORRECTNESS_TRUE_EXAMPLE_INPUT_CONV),
+                ChatMessage(role="assistant", content=CORRECTNESS_TRUE_EXAMPLE_OUTPUT_CONV),
                 ChatMessage(
                     role="user",
                     content=CORRECTNESS_INPUT_TEMPLATE.format(
-                        question=question_sample.question,
-                        agent_answer=answer.message,
-                        ground_truth=question_sample.reference_answer,
+                        conversation=format_conversation(
+                            question_sample.conversation_history
+                            + [{"role": "user", "content": question_sample.question}]
+                        ),
+                        answer=answer.message,
+                        reference_answer=question_sample.reference_answer,
+                        description=self.agent_description,
                     ),
                 ),
             ],
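
To make the new message flow concrete, here is a self-contained sketch (not part of the commit) of the final user message the metric now builds for a conversational sample. The template and helper are copied from the diff above; the sample values mirror the test fixture below, and the agent description is invented:

# Sketch: assembling the final evaluation input for a conversational sample.
CORRECTNESS_INPUT_TEMPLATE = """
### AGENT DESCRIPTION
{description}

### CONVERSATION
{conversation}

### AGENT ANSWER
{answer}

### REFERENCE ANSWER
{reference_answer}
"""

def format_conversation(conversation: list[dict]):
    return "\n\n".join([f"<{msg['role'].lower()}>{msg['content']}</{msg['role'].lower()}>" for msg in conversation])

history = [{"role": "user", "content": "Scamorza"}]
question = "Where is it from?"

print(CORRECTNESS_INPUT_TEMPLATE.format(
    description="A chatbot answering questions about cheese",  # invented description
    conversation=format_conversation(history + [{"role": "user", "content": question}]),
    answer="Scamorza is from Southern Italy.",
    reference_answer="Scamorza is from Southern Italy.",
))
# The CONVERSATION section renders the stored history plus the current
# question as consecutive <user>/<assistant> turns.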

tests/rag/test_qa_testset.py

Lines changed: 2 additions & 2 deletions
@@ -75,7 +75,7 @@ def make_testset_samples():
             question="Where is it from?",
             reference_answer="Scarmorza is from Southern Italy.",
             reference_context="Scamorza is a Southern Italian cow's milk cheese.",
-            conversation_history=["Scamorza"],
+            conversation_history=[{"role": "user", "content": "Scamorza"}],
             metadata={
                 "question_type": "conversational",
                 "color": "blue",

@@ -173,7 +173,7 @@ def test_testset_samples_property():
         "seed_document_id": "1",
     }
     assert testset.samples[-1].question == "Where is it from?"
-    assert testset.samples[-1].conversation_history == ["Scamorza"]
+    assert testset.samples[-1].conversation_history == [{"role": "user", "content": "Scamorza"}]
     assert testset.samples[-1].id == "6"
     assert testset.samples[-1].metadata == {
         "question_type": "conversational",
