2 changes: 1 addition & 1 deletion docs/conf.py
@@ -34,7 +34,7 @@

autoclass_content = "both"

# autodoc_mock_imports = ["giskard.ml_worker.generated"]
autodoc_mock_imports = ["ragas", "langchain_core"]
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

3 changes: 0 additions & 3 deletions docs/open_source/testset_generation/rag_evaluation/index.md
@@ -129,9 +129,6 @@ test_suite = testset.to_test_suite("My first test suite")
test_suite.run(model=giskard_model)
```

![](./test_suite_widget.png)


Note that you can split the test suite on the question metadata values, for instance on each question type.

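A minimal sketch of one way to do this, assuming `QATestset` exposes a `to_pandas()` method with a `metadata` column of per-question dicts (both are assumptions here, to be checked against the actual giskard API):

```python
# Hedged sketch: group testset questions by their "question_type" metadata.
# Assumes testset.to_pandas() returns a DataFrame whose "metadata" column
# holds per-question dicts -- verify against the giskard API before use.
df = testset.to_pandas()
by_type = df.groupby(df["metadata"].apply(lambda m: m.get("question_type")))
for question_type, questions in by_type:
    print(question_type, len(questions))
```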
Binary file not shown.
2 changes: 1 addition & 1 deletion docs/reference/rag-toolset/evaluation.rst
@@ -1,5 +1,5 @@
Evaluation
======
==========

.. autofunction:: giskard.rag.evaluate

2 changes: 1 addition & 1 deletion docs/reference/rag-toolset/index.rst
@@ -1,5 +1,5 @@
RAG Evaluation Toolkit
=============
======================

.. toctree::
:maxdepth: 2
28 changes: 22 additions & 6 deletions docs/reference/rag-toolset/metrics.rst
@@ -1,12 +1,28 @@
Metrics
=======

Available Metric functions
==========================

Correctness
-----------
Using an LLM-as-a-judge strategy, the correctness metric checks whether an answer is correct with respect to the reference answer.

.. autofunction:: giskard.rag.metrics.correctness.correctness_metric

RAGAS Metrics
-------------
We provide wrappers for some RAGAS metrics. You can implement other RAGAS metrics using the `RagasMetric` class.

.. autofunction:: giskard.rag.metrics.ragas_metrics.ragas_context_precision

.. autofunction:: giskard.rag.metrics.ragas_metrics.ragas_faithfulness

.. autofunction:: giskard.rag.metrics.ragas_metrics.ragas_answer_relevancy

.. autofunction:: giskard.rag.metrics.ragas_metrics.ragas_context_recall

Base Metric
-----------
.. autoclass:: giskard.rag.metrics.Metric
:members:
:special-members: __call__

.. autoclass:: giskard.rag.metrics.correctness.CorrectnessMetric
:members:

.. autoclass:: giskard.rag.metrics.ragas_metrics.RagasMetric
:members:
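As a usage sketch tying these together (assuming `giskard.rag.evaluate` accepts a `metrics` argument, which this page suggests but does not show), the metric functions above can be passed to the evaluation:

```python
# Sketch: passing metric functions to giskard.rag.evaluate.
# `answer_fn`, `testset` and `knowledge_base` are assumed to exist already;
# the `metrics` keyword is an assumption about evaluate()'s signature.
from giskard.rag import evaluate
from giskard.rag.metrics.correctness import correctness_metric
from giskard.rag.metrics.ragas_metrics import ragas_faithfulness

report = evaluate(
    answer_fn,  # your RAG agent: (question: str) -> str
    testset=testset,
    knowledge_base=knowledge_base,
    metrics=[correctness_metric, ragas_faithfulness],
)
```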
3 changes: 3 additions & 0 deletions docs/reference/rag-toolset/question_generation.rst
@@ -18,3 +18,6 @@ Question Generation

.. autoclass:: giskard.rag.question_generators.ConversationalQuestionsGenerator
:members:

.. autoclass:: giskard.rag.question_generators.OutOfScopeGenerator
:members:
2 changes: 1 addition & 1 deletion docs/reference/rag-toolset/testset_generation.rst
@@ -1,5 +1,5 @@
Testset Generation
======
==================

.. autofunction:: giskard.rag.generate_testset

4 changes: 2 additions & 2 deletions giskard/rag/knowledge_base.py
@@ -96,8 +96,8 @@ class KnowledgeBase:
The seed to use for random number generation.
llm_client: LLMClient, optional:
The LLM client to use for question generation. If not specified, a default OpenAI client will be used.
embedding_model: str = "text-embedding-ada-002"
The name of the embedding model to use for the knowledge base. It should match the llm_client available embedding models.
embedding_model: BaseEmbedding, optional
The Giskard embedding model to use for the knowledge base. By default, Giskard's default embedding model is used, which is OpenAI's "text-embedding-ada-002".
min_topic_size: int, optional
The minimum number of documents to form a topic inside the knowledge base.
chunk_size: int = 2048
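A construction sketch based on the parameters documented above (`from_pandas` and its `columns` argument are assumptions not shown in this diff):

```python
# Sketch: building a KnowledgeBase with the documented keyword parameters.
import pandas as pd
from giskard.rag import KnowledgeBase

# Hypothetical corpus; a real knowledge base is built from your documents.
df = pd.DataFrame({"text": ["First document...", "Second document..."]})

kb = KnowledgeBase.from_pandas(  # `from_pandas` is an assumed constructor
    df,
    columns=["text"],   # assumed column-selection argument
    min_topic_size=2,   # minimum number of documents to form a topic
    chunk_size=2048,    # default documented above
)
```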
13 changes: 6 additions & 7 deletions giskard/rag/metrics/base.py
@@ -16,19 +16,18 @@ def __init__(self, name: str, llm_client: LLMClient = None) -> None:
@abstractmethod
def __call__(self, question_sample: dict, answer: str):
"""
Compute the metric on the test set and the answers.
Compute the metric on a single question and its associated answer.

Parameters
----------
testset : QATestset
The test set to compare the answers with.
answers : Sequence[str]
The answers of the agent to evaluate.
question_sample : dict
A question sample from a QATestset.
answer : str
The agent's answer to that question.

Returns
-------
dict
The result of the metric. The keys should be the name of the metrics and the
values should be the result of the metric for each question/answer pair.
The result of the metric. The keys should be the names of the metrics computed.
"""
pass
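Given this contract, a custom metric can be sketched as a small subclass (the `answer_length` metric is purely illustrative):

```python
# Sketch of a custom Metric: __call__ takes one question sample and the
# agent's answer, and returns a dict keyed by metric name, as documented.
from giskard.rag.metrics import Metric

class AnswerLengthMetric(Metric):
    def __call__(self, question_sample: dict, answer: str) -> dict:
        # Illustrative metric: report the answer length in words.
        return {"answer_length": len(answer.split())}

answer_length = AnswerLengthMetric(name="answer_length")
```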
14 changes: 1 addition & 13 deletions giskard/rag/metrics/ragas_metrics.py
@@ -20,7 +20,6 @@
from ragas.metrics.base import Metric as BaseRagasMetric
from ragas.run_config import RunConfig


except ImportError as err:
raise ImportError(
f"Package {err.name} is missing, it is required for the computation of RAGAS metrics. You can install it with `pip install {err.name}`."
@@ -62,17 +61,6 @@ def embed_documents(self, texts: Sequence[str]) -> Sequence[Sequence[float]]:


class RagasMetric(Metric):
"""
A wrapper for RAGAS metrics, so they can be used inside the `~giskard.rag.evaluate` function.

Parameters
----------
name : str
The name of the metric.
metrics : Union[BaseRagasMetric, Sequence[BaseRagasMetric]]
The list of RAGAS metrics to use.
"""

def __init__(
self, name: str, metric: BaseRagasMetric, context_window_length: int = 8192, llm_client: LLMClient = None
) -> None:
@@ -81,7 +69,7 @@ def __init__(
self.context_window_length = context_window_length
self._llm_client = llm_client

def __call__(self, question_sample, answer) -> dict:
def __call__(self, question_sample: dict, answer: str) -> dict:
llm_client = self._llm_client or get_default_client()
ragas_llm = RagasLLMWrapper(llm_client, self.context_window_length)
ragas_embedddings = RagasEmbeddingsWrapper(llm_client)
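Based on the constructor shown above, wrapping a RAGAS metric could look like this sketch (the `faithfulness` import path is an assumption about the ragas package layout):

```python
# Sketch: wrapping a raw RAGAS metric in RagasMetric.
from ragas.metrics import faithfulness  # assumed import path
from giskard.rag.metrics.ragas_metrics import RagasMetric

my_faithfulness = RagasMetric(
    name="faithfulness",
    metric=faithfulness,
    context_window_length=8192,  # default from the constructor above
)
```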
4 changes: 2 additions & 2 deletions giskard/rag/report.py
@@ -44,8 +44,8 @@ class RAGReport:
The testset used to evaluate the agent.
results : Sequence[dict]
The evaluation results of the agent's answers. Should be a list of dictionaries with the following keys: "evaluation", "reason", "agent_answer".
metrics_results : pd.DataFrame, optional
The additional metrics computed during the evaluation. If provided, these metrics will be included in the report.
metrics_results : dict, optional
The additional metrics computed during the evaluation. If provided, these metrics will be included in the report. The dict should have the following structure: `{"question_id": {"correctness": bool, "correctness_reason": str, "additional_metric": value, ...}}`.
knowledge_base : KnowledgeBase
The knowledge base used to create the testset.
"""
Expand Down