
Commit 3bea570

Merge pull request #1861 from Giskard-AI/task/out-of-kb-questions-gsk-3372
RAG Toolkit: Out of KB questions [GSK-3372]
2 parents 5ff5082 + 72cb247

3 files changed, +169 -6 lines changed

giskard/rag/question_generators/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -3,6 +3,7 @@
 from .conversational_questions import ConversationalQuestionsGenerator, conversational_questions
 from .distracting_questions import DistractingQuestionsGenerator, distracting_questions
 from .double_questions import DoubleQuestionsGenerator, double_questions
+from .oos_questions import OutOfScopeGenerator, oos_questions
 from .question_types import COMPONENT_DESCRIPTIONS, QUESTION_ATTRIBUTION, RAGComponents
 from .simple_questions import SimpleQuestionsGenerator, simple_questions
 from .situational_questions import SituationalQuestionsGenerator, situational_questions
@@ -18,10 +19,12 @@
     "DistractingQuestionsGenerator",
     "SituationalQuestionsGenerator",
     "DoubleQuestionsGenerator",
+    "OutOfScopeGenerator",
     "simple_questions",
     "complex_questions",
     "conversational_questions",
     "distracting_questions",
     "situational_questions",
     "double_questions",
+    "oos_questions",
 ]
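
With these exports in place, the new generator is importable from the package root. A minimal sketch (not part of the diff):

    from giskard.rag.question_generators import OutOfScopeGenerator, oos_questions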
giskard/rag/question_generators/oos_questions.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
import uuid

from ..knowledge_base import KnowledgeBase
from .base import GenerateFromSingleQuestionMixin, _LLMBasedQuestionGenerator
from .prompt import QAGenerationPrompt

OOS_PROMPT = """
You are a powerful auditor and mindful judge; your role is to generate a question from a given context,
adding some fake or non-existing details to the context, to check whether the agent you are auditing is capable of handling questions
which have no direct answer in the provided context.

The agent you are auditing is described below:
{agent_description}

Here are your tasks; complete them step by step:
1. Select one fact from the context.
2. Imagine a fake detail that is not present anywhere in the provided context but is plausible given the fact you selected in the previous step.
3. Isolate this fake detail into a single sentence.
4. Generate an open question asking about this new detail; make sure the question is relevant and cannot be answered from the information in the context.

The generated question should be in the following language: {language}

You will first be provided with an example, followed by the user input. Read the example thoroughly and take inspiration from it, but do not use information or names from the example in your answers.
You will return the isolated detail/fact and the question based exclusively on the newly added isolated context.
You must output a single JSON object with keys 'selected_fact', 'fake_fact' and 'question', without any other wrapping text or markdown, and with everything in lowercase. Make sure you only return valid JSON.
"""

OOS_QUESTION_EXAMPLE_INPUT = """
Paul usually goes to the market at 8:00 AM. He starts with the grocery store and then goes to the bakery.
He enjoys buying a fresh baguette every morning at the bakery. The bakery is located at the corner of his street."""

OOS_QUESTION_EXAMPLE_OUTPUT = """
{
    "selected_fact": "Paul likes to buy a baguette every day.",
    "fake_fact": "Paul pays 1 euro for a baguette",
    "question": "How much does Paul pay for his baguette?"
}
"""

DUMMY_ANSWER = "This question cannot be answered by the context. No sufficient information is provided in the context to answer this question."


class OutOfScopeGenerator(GenerateFromSingleQuestionMixin, _LLMBasedQuestionGenerator):
    """
    Out-of-knowledge-base question generator that builds questions from a KnowledgeBase.

    Parameters
    ----------
    context_neighbors: int, optional
        Number of context neighbors to use for question generation.
    context_similarity_threshold: float, optional
        Similarity threshold to keep neighboring documents during question generation.
    context_window_length: int, optional
        Context window length of the llm used in the `llm_client` of the generator.
    llm_client: LLMClient, optional
        The LLM client to use for question generation. If not specified, a default OpenAI client will be used.
    llm_temperature: float, optional
        The temperature to use in the LLM for question generation. The default is 0.5.
    """

    _OOS_question_generation_prompt = QAGenerationPrompt(
        system_prompt=OOS_PROMPT,
        example_input=OOS_QUESTION_EXAMPLE_INPUT,
        example_output=OOS_QUESTION_EXAMPLE_OUTPUT,
    )

    _question_type = "out of scope"

    def generate_single_question(self, knowledge_base: KnowledgeBase, agent_description: str, language: str) -> dict:
        """
        Generate a question from a list of context documents.

        Parameters
        ----------
        knowledge_base: KnowledgeBase
            The knowledge base to generate the question from.
        agent_description: str
            The description of the agent to generate the question for.
        language: str
            The language to generate the question in.

        Returns
        -------
        dict
            The generated question, including its metadata.
        """
        seed_document = knowledge_base.get_random_document()

        context_documents = knowledge_base.get_neighbors(
            seed_document, self._context_neighbors, self._context_similarity_threshold
        )

        reference_context = "\n\n".join([f"Document {doc.id}: {doc.content}" for doc in context_documents])

        # Set up the out-of-knowledge-base (OOKB) question generation prompt
        question_messages = self._OOS_question_generation_prompt.to_messages(
            system_prompt_input={"agent_description": agent_description, "language": language},
            user_input=seed_document.content,
        )

        generated_qa = self._llm_complete(messages=question_messages)

        question_metadata = {
            "question_type": self._question_type,
            "seed_document_id": seed_document.id,
            "fake_fact": generated_qa["fake_fact"],
        }

        question = {
            "id": str(uuid.uuid4()),
            "question": generated_qa["question"],
            "reference_answer": DUMMY_ANSWER,
            "reference_context": reference_context,
            "conversation_history": [],
            "metadata": question_metadata,
        }

        return question


oos_questions = OutOfScopeGenerator()
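
A minimal usage sketch (not part of the commit), assuming a default LLM client is configured and `kb` is an already-constructed KnowledgeBase; `generate_questions` is the entry point exercised by the new test below, presumably provided by GenerateFromSingleQuestionMixin:

    from giskard.rag.question_generators import oos_questions

    # Each yielded question dict uses the DUMMY_ANSWER sentinel as its
    # reference_answer, i.e. the audited agent is expected to abstain.
    questions = list(
        oos_questions.generate_questions(
            knowledge_base=kb, num_questions=5, agent_description="A shop assistant chatbot", language="en"
        )
    )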

tests/rag/test_question_generators.py

Lines changed: 45 additions & 6 deletions
@@ -7,6 +7,7 @@
     ConversationalQuestionsGenerator,
     DistractingQuestionsGenerator,
     DoubleQuestionsGenerator,
+    OutOfScopeGenerator,
     SimpleQuestionsGenerator,
     SituationalQuestionsGenerator,
 )
@@ -15,7 +16,6 @@
 def test_simple_question_generation():
     knowledge_base = Mock()
     llm_client = Mock()
-    llm_client.complete = Mock()
     llm_client.complete.side_effect = [
         LLMMessage(
             role="assistant",
@@ -55,7 +55,6 @@ def test_simple_question_generation():
 
 def test_complex_question_generation():
     llm_client = Mock()
-    llm_client.complete = Mock()
     llm_client.complete.side_effect = [
         LLMMessage(
             role="assistant",
@@ -99,7 +98,6 @@ def test_complex_question_generation():
 
 def test_distracting_question_generation():
     llm_client = Mock()
-    llm_client.complete = Mock()
     llm_client.complete.side_effect = [
         LLMMessage(
             role="assistant",
@@ -147,7 +145,6 @@ def test_distracting_question_generation():
 
 def test_situational_question_generation():
     llm_client = Mock()
-    llm_client.complete = Mock()
     llm_client.complete.side_effect = [
         LLMMessage(
             role="assistant",
@@ -195,7 +192,6 @@ def test_situational_question_generation():
 
 def test_double_question_generation():
     llm_client = Mock()
-    llm_client.complete = Mock()
     llm_client.complete.side_effect = [
         LLMMessage(
             role="assistant",
@@ -242,7 +238,6 @@ def test_double_question_generation():
 
 def test_conversational_question_generation():
     llm_client = Mock()
-    llm_client.complete = Mock()
     llm_client.complete.side_effect = [
         LLMMessage(
             role="assistant",
@@ -285,3 +280,47 @@ def test_conversational_question_generation():
     ]
     assert question["metadata"]["question_type"] == "conversational"
     assert question["metadata"]["seed_document_id"] == 2
+
+
+def test_oos_question_generation():
+    knowledge_base = Mock()
+    llm_client = Mock()
+
+    llm_client.complete.side_effect = [
+        LLMMessage(
+            role="assistant",
+            content='{"selected_fact": "Paul Graham liked to buy a baguette every day at the local market.", "fake_fact": "Paul Graham paid 1 USD for a baguette", "question": "How much did Paul pay for the baguette?"}',
+        )
+    ]
+
+    documents = [
+        Document(dict(content="Paul Graham liked to buy a baguette every day at the local market."), idx=1),
+        Document(dict(content="Cheese is made of milk."), idx=2),
+        Document(dict(content="Milk is produced by cows, goats or sheep."), idx=3),
+    ]
+    knowledge_base.get_random_document = Mock(
+        return_value=Document(dict(content="Paul Graham liked to buy a baguette every day at the local market."), idx=1)
+    )
+    knowledge_base.get_neighbors = Mock(return_value=documents)
+
+    question_generator = OutOfScopeGenerator(llm_client=llm_client)
+
+    question = list(
+        question_generator.generate_questions(
+            knowledge_base=knowledge_base, num_questions=1, agent_description="Test", language="en"
+        )
+    )[0]
+
+    assert question["question"] == "How much did Paul pay for the baguette?"
+    assert isinstance(question["id"], str)
+    assert (
+        question["reference_answer"]
+        == "This question cannot be answered by the context. No sufficient information is provided in the context to answer this question."
+    )
+    assert (
+        question["reference_context"]
+        == "Document 1: Paul Graham liked to buy a baguette every day at the local market.\n\nDocument 2: Cheese is made of milk.\n\nDocument 3: Milk is produced by cows, goats or sheep."
+    )
+    assert question["conversation_history"] == []
+    assert question["metadata"]["question_type"] == "out of scope"
+    assert question["metadata"]["seed_document_id"] == 1