2 changes: 1 addition & 1 deletion docs/conf.py
@@ -34,7 +34,7 @@

autoclass_content = "both"

# autodoc_mock_imports = ["giskard.ml_worker.generated"]
autodoc_mock_imports = ["ragas", "langchain_core"]
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

3 changes: 0 additions & 3 deletions docs/open_source/testset_generation/rag_evaluation/index.md
@@ -129,9 +129,6 @@ test_suite = testset.to_test_suite("My first test suite")
test_suite.run(model=giskard_model)
```

![](./test_suite_widget.png)


Note that you can split the test suite on the question metadata values, for instance on each question type.

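A minimal sketch of one way to do this, assuming `QATestset` exposes a `to_pandas()` method with a `metadata` column of per-question dicts (both are assumptions here, to be checked against the actual giskard API):

```python
# Hedged sketch: group testset questions by their "question_type" metadata.
# Assumes testset.to_pandas() returns a DataFrame whose "metadata" column
# holds per-question dicts -- verify against the giskard API before use.
df = testset.to_pandas()
by_type = df.groupby(df["metadata"].apply(lambda m: m.get("question_type")))
for question_type, questions in by_type:
    print(question_type, len(questions))
```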
Binary file not shown.
2 changes: 1 addition & 1 deletion docs/reference/rag-toolset/evaluation.rst
@@ -1,5 +1,5 @@
Evaluation
======
==========

.. autofunction:: giskard.rag.evaluate

2 changes: 1 addition & 1 deletion docs/reference/rag-toolset/index.rst
@@ -1,5 +1,5 @@
RAG Evaluation Toolkit
=============
======================

.. toctree::
:maxdepth: 2
28 changes: 22 additions & 6 deletions docs/reference/rag-toolset/metrics.rst
@@ -1,12 +1,28 @@
Metrics
=======

Available Metric functions
==========================

Correctness
-----------
Using an LLM-as-a-judge strategy, the correctness metric checks whether an answer is correct with respect to the reference answer.

.. autofunction:: giskard.rag.metrics.correctness.correctness_metric

RAGAS Metrics
-------------
We provide wrappers for some RAGAS metrics. You can implement other RAGAS metrics using the `RagasMetric` class.

.. autofunction:: giskard.rag.metrics.ragas_metrics.ragas_context_precision

.. autofunction:: giskard.rag.metrics.ragas_metrics.ragas_faithfulness

.. autofunction:: giskard.rag.metrics.ragas_metrics.ragas_answer_relevancy

.. autofunction:: giskard.rag.metrics.ragas_metrics.ragas_context_recall

Base Metric
-----------
.. autoclass:: giskard.rag.metrics.Metric
:members:
:special-members: __call__

.. autoclass:: giskard.rag.metrics.correctness.CorrectnessMetric
:members:

.. autoclass:: giskard.rag.metrics.ragas_metrics.RagasMetric
:members:
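As a usage sketch tying these together (assuming `giskard.rag.evaluate` accepts a `metrics` argument, which this page suggests but does not show), the metric functions above can be passed to the evaluation:

```python
# Sketch: passing metric functions to giskard.rag.evaluate.
# `answer_fn`, `testset` and `knowledge_base` are assumed to exist already;
# the `metrics` keyword is an assumption about evaluate()'s signature.
from giskard.rag import evaluate
from giskard.rag.metrics.correctness import correctness_metric
from giskard.rag.metrics.ragas_metrics import ragas_faithfulness

report = evaluate(
    answer_fn,  # your RAG agent: (question: str) -> str
    testset=testset,
    knowledge_base=knowledge_base,
    metrics=[correctness_metric, ragas_faithfulness],
)
```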
3 changes: 3 additions & 0 deletions docs/reference/rag-toolset/question_generation.rst
@@ -18,3 +18,6 @@ Question Generation

.. autoclass:: giskard.rag.question_generators.ConversationalQuestionsGenerator
:members:

.. autoclass:: giskard.rag.question_generators.OutOfScopeGenerator
:members:
2 changes: 1 addition & 1 deletion docs/reference/rag-toolset/testset_generation.rst
@@ -1,5 +1,5 @@
Testset Generation
======
==================

.. autofunction:: giskard.rag.generate_testset

4 changes: 2 additions & 2 deletions giskard/rag/knowledge_base.py
@@ -96,8 +96,8 @@ class KnowledgeBase:
The seed to use for random number generation.
llm_client: LLMClient, optional:
The LLM client to use for question generation. If not specified, a default OpenAI client will be used.
embedding_model: str = "text-embedding-ada-002"
The name of the embedding model to use for the knowledge base. It should match the llm_client available embedding models.
embedding_model: BaseEmbedding, optional
The Giskard embedding model to use for the knowledge base. By default, Giskard's default embedding model is used, which is OpenAI's "text-embedding-ada-002".
min_topic_size: int, optional
The minimum number of documents to form a topic inside the knowledge base.
chunk_size: int = 2048
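A construction sketch based on the parameters documented above (`from_pandas` and its `columns` argument are assumptions not shown in this diff):

```python
# Sketch: building a KnowledgeBase with the documented keyword parameters.
import pandas as pd
from giskard.rag import KnowledgeBase

# Hypothetical corpus; a real knowledge base is built from your documents.
df = pd.DataFrame({"text": ["First document...", "Second document..."]})

kb = KnowledgeBase.from_pandas(  # `from_pandas` is an assumed constructor
    df,
    columns=["text"],   # assumed column-selection argument
    min_topic_size=2,   # minimum number of documents to form a topic
    chunk_size=2048,    # default documented above
)
```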
13 changes: 6 additions & 7 deletions giskard/rag/metrics/base.py
@@ -16,19 +16,18 @@ def __init__(self, name: str, llm_client: LLMClient = None) -> None:
@abstractmethod
def __call__(self, question_sample: dict, answer: str):
"""
Compute the metric on the test set and the answers.
Compute the metric on a single question and its associated answer.

Parameters
----------
testset : QATestset
The test set to compare the answers with.
answers : Sequence[str]
The answers of the agent to evaluate.
question_sample : dict
A question sample from a QATestset.
answer : str
The agent's answer to that question.

Returns
-------
dict
The result of the metric. The keys should be the name of the metrics and the
values should be the result of the metric for each question/answer pair.
The result of the metric. The keys should be the names of the metrics computed.
"""
pass
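Given this contract, a custom metric can be sketched as a small subclass (the `answer_length` metric is purely illustrative):

```python
# Sketch of a custom Metric: __call__ takes one question sample and the
# agent's answer, and returns a dict keyed by metric name, as documented.
from giskard.rag.metrics import Metric

class AnswerLengthMetric(Metric):
    def __call__(self, question_sample: dict, answer: str) -> dict:
        # Illustrative metric: report the answer length in words.
        return {"answer_length": len(answer.split())}

answer_length = AnswerLengthMetric(name="answer_length")
```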
14 changes: 1 addition & 13 deletions giskard/rag/metrics/ragas_metrics.py
@@ -20,7 +20,6 @@
from ragas.metrics.base import Metric as BaseRagasMetric
from ragas.run_config import RunConfig


except ImportError as err:
raise ImportError(
f"Package {err.name} is missing, it is required for the computation of RAGAS metrics. You can install it with `pip install {err.name}`."
@@ -62,17 +61,6 @@ def embed_documents(self, texts: Sequence[str]) -> Sequence[Sequence[float]]:


class RagasMetric(Metric):
"""
A wrapper for RAGAS metrics, so they can be used inside the `~giskard.rag.evaluate` function.

Parameters
----------
name : str
The name of the metric.
metrics : Union[BaseRagasMetric, Sequence[BaseRagasMetric]]
The list of RAGAS metrics to use.
"""

def __init__(
self, name: str, metric: BaseRagasMetric, context_window_length: int = 8192, llm_client: LLMClient = None
) -> None:
@@ -81,7 +69,7 @@ def __init__(
self.context_window_length = context_window_length
self._llm_client = llm_client

def __call__(self, question_sample, answer) -> dict:
def __call__(self, question_sample: dict, answer: str) -> dict:
llm_client = self._llm_client or get_default_client()
ragas_llm = RagasLLMWrapper(llm_client, self.context_window_length)
ragas_embedddings = RagasEmbeddingsWrapper(llm_client)
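Based on the constructor shown above, wrapping a RAGAS metric could look like this sketch (the `faithfulness` import path is an assumption about the ragas package layout):

```python
# Sketch: wrapping a raw RAGAS metric in RagasMetric.
from ragas.metrics import faithfulness  # assumed import path
from giskard.rag.metrics.ragas_metrics import RagasMetric

my_faithfulness = RagasMetric(
    name="faithfulness",
    metric=faithfulness,
    context_window_length=8192,  # default from the constructor above
)
```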
4 changes: 2 additions & 2 deletions giskard/rag/report.py
@@ -44,8 +44,8 @@ class RAGReport:
The testset used to evaluate the agent.
results : Sequence[dict]
The evaluation results of the agent's answers. Should be a list of dictionaries with the following keys: "evaluation", "reason", "agent_answer".
metrics_results : pd.DataFrame, optional
The additional metrics computed during the evaluation. If provided, these metrics will be included in the report.
metrics_results : dict, optional
The additional metrics computed during the evaluation. If provided, these metrics will be included in the report. The dict should have the following structure: `{"question_id": {"correctness": bool, "correctness_reason": str, "additional_metric": value, ...}}`.
knowledge_base : KnowledgeBase
The knowledge base used to create the testset.
"""
Expand Down