Merged
20 commits
e0202b0
Added a LLM-as-judge per row evaluator
kevinmessiaen Dec 7, 2023
d3e02fc
Removed wrong `@abstractmethod`
kevinmessiaen Dec 7, 2023
17a825f
Fixed issue with requirements
kevinmessiaen Dec 7, 2023
d311ce2
Code improvement
kevinmessiaen Dec 7, 2023
28c0086
Code improvement
kevinmessiaen Dec 7, 2023
9b30a63
Added test for PerRowRequirementEvaluator
kevinmessiaen Dec 7, 2023
f00c7c7
Added test for PerRowRequirementEvaluator
kevinmessiaen Dec 7, 2023
bb8893d
Made output_ds optional for evaluators
kevinmessiaen Dec 7, 2023
7b854bb
Updated test name
kevinmessiaen Dec 7, 2023
1f7e22a
Merge branch 'main' into feature/gsk-2300-llm-as-a-judge-row-wise-cus…
kevinmessiaen Dec 8, 2023
12521cc
Merge branch 'main' into feature/gsk-2300-llm-as-a-judge-row-wise-cus…
kevinmessiaen Dec 13, 2023
15d5ae2
Merge branch 'main' into feature/gsk-2300-llm-as-a-judge-row-wise-cus…
kevinmessiaen Dec 14, 2023
2337c7a
Removed legacy debugging for test_llm_single_output_against_requirement
kevinmessiaen Dec 18, 2023
88f930d
Merge remote-tracking branch 'origin/feature/gsk-2300-llm-as-a-judge-…
kevinmessiaen Dec 18, 2023
908261e
Merge branch 'main' into feature/gsk-2300-llm-as-a-judge-row-wise-cus…
kevinmessiaen Dec 18, 2023
3b78afb
Fixed tests
kevinmessiaen Dec 18, 2023
f9fa8ac
Merge branch 'main' into feature/gsk-2300-llm-as-a-judge-row-wise-cus…
kevinmessiaen Dec 20, 2023
0c63c81
Merge branch 'main' into feature/gsk-2300-llm-as-a-judge-row-wise-cus…
kevinmessiaen Dec 20, 2023
7336888
Merge branch 'main' into feature/gsk-2300-llm-as-a-judge-row-wise-cus…
andreybavt Dec 20, 2023
f884c3c
Merge branch 'main' into feature/gsk-2300-llm-as-a-judge-row-wise-cus…
Hartorn Dec 20, 2023
4 changes: 2 additions & 2 deletions giskard/llm/evaluators/__init__.py
@@ -1,5 +1,5 @@
from .coherency import CoherencyEvaluator
from .plausibility import PlausibilityEvaluator
from .requirements import RequirementEvaluator
from .requirements import RequirementEvaluator, PerRowRequirementEvaluator

__all__ = ["CoherencyEvaluator", "RequirementEvaluator", "PlausibilityEvaluator"]
__all__ = ["CoherencyEvaluator", "RequirementEvaluator", "PerRowRequirementEvaluator", "PlausibilityEvaluator"]
16 changes: 11 additions & 5 deletions giskard/llm/evaluators/base.py
@@ -1,6 +1,6 @@
from dataclasses import dataclass

from typing import Sequence
from typing import Sequence, Optional

from ..client import LLMClient, get_default_client
from ..errors import LLMGenerationError
@@ -34,6 +34,7 @@ class EvaluationResult:
failure_examples: Sequence[dict]
success_examples: Sequence[dict]
errors: Sequence[dict]
output_ds: Optional[Dataset] = None

@property
def passed(self):
@@ -60,7 +61,7 @@ def __init__(self, eval_prompt=None, llm_temperature=0.1, llm_client: LLMClient
self.llm_temperature = llm_temperature
self.llm_client = llm_client if llm_client is not None else get_default_client()

def _make_evaluate_prompt(self, model: BaseModel, input_vars, model_output):
def _make_evaluate_prompt(self, model: BaseModel, input_vars, model_output, row_idx):
return self.eval_prompt.format(
model_name=model.meta.name,
model_description=model.meta.description,
@@ -76,12 +77,15 @@ def evaluate(self, model: BaseModel, dataset: Dataset):

succeeded = []
failed = []
failed_idx = []
errored = []
for input_vars, model_output in zip(
dataset.df.loc[:, model.meta.feature_names].to_dict("records"), model_outputs
for row_index, input_vars, model_output in zip(
dataset.df.index,
dataset.df.loc[:, model.meta.feature_names].to_dict("records"),
model_outputs,
):
sample = {"input_vars": input_vars, "model_output": model_output}
prompt = self._make_evaluate_prompt(model, input_vars, model_output)
prompt = self._make_evaluate_prompt(model, input_vars, model_output, row_index)
funcs = self._make_evaluate_functions(model, input_vars, model_output)
try:
out = self.llm_client.complete(
@@ -101,9 +105,11 @@ def evaluate(self, model: BaseModel, dataset: Dataset):
if args["passed_test"]:
succeeded.append({"input_vars": input_vars, "model_output": model_output, "reason": args.get("reason")})
else:
failed_idx.append(row_index)
failed.append({"input_vars": input_vars, "model_output": model_output, "reason": args.get("reason")})

return EvaluationResult(
output_ds=dataset.slice(lambda df: df.loc[failed_idx], row_level=False),
failure_examples=failed,
success_examples=succeeded,
errors=errored,
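For context on the new `output_ds` field: the evaluation loop above now records the indices of failing rows and attaches a slice of the evaluated dataset to the result. A minimal sketch of how a caller might consume it (`my_model` and `my_dataset` are hypothetical placeholders, not part of this PR):

```python
# Hypothetical usage sketch: inspect the failing rows attached to the result.
# `my_model` and `my_dataset` stand in for a giskard BaseModel and Dataset.
from giskard.llm.evaluators import RequirementEvaluator

evaluator = RequirementEvaluator(["The model must answer politely"])
result = evaluator.evaluate(my_model, my_dataset)

if not result.passed:
    # output_ds is a Dataset slice containing only the failing rows,
    # built with dataset.slice(..., row_level=False) in the loop above.
    print(result.output_ds.df)
    for example in result.failure_examples:
        print(example["reason"])
```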
43 changes: 37 additions & 6 deletions giskard/llm/evaluators/requirements.py
@@ -1,7 +1,10 @@
from abc import abstractmethod

import pandas as pd
from typing import Sequence

from ...models.base.model import BaseModel
from .base import LLMBasedEvaluator
from ...models.base.model import BaseModel

EVALUATE_PROMPT = """Your role is to test AI models. Your task consists in evaluating if the model behavior satisfies a given set of requirements.

@@ -30,18 +33,46 @@
"""


class RequirementEvaluator(LLMBasedEvaluator):
class BaseRequirementEvaluator(LLMBasedEvaluator):
"""Base class for requirement evaluation."""

_default_eval_prompt = EVALUATE_PROMPT

def __init__(self, requirements: Sequence[str], *args, **kwargs):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.requirements = requirements

def _make_evaluate_prompt(self, model: BaseModel, input_vars, model_output):
@abstractmethod
def requirements(self, row_idx) -> str:
"""Define the evaluation requirements for a given input."""
...

def _make_evaluate_prompt(self, model: BaseModel, input_vars, model_output, row_idx):
return self.eval_prompt.format(
model_name=model.meta.name,
model_description=model.meta.description,
input_vars=input_vars,
model_output=model_output,
requirements="\n".join([f"- {r}" for r in self.requirements]),
requirements=self.requirements(row_idx),
)


class RequirementEvaluator(BaseRequirementEvaluator):
"""Evaluator for global requirements over the entire dataset."""

def __init__(self, requirements: Sequence[str], *args, **kwargs):
super().__init__(*args, **kwargs)
self.requirements_list = requirements

def requirements(self, row_idx):
return "\n".join([f"- {r}" for r in self.requirements_list])


class PerRowRequirementEvaluator(BaseRequirementEvaluator):
"""Evaluator for requirements evaluated individually for each row in a dataset."""

def __init__(self, requirements_df: pd.DataFrame, *args, **kwargs):
super().__init__(*args, **kwargs)
self.requirements_df = requirements_df

def requirements(self, row_idx):
return "\n".join([f"- {r}" for r in self.requirements_df.iloc[row_idx]])
7 changes: 6 additions & 1 deletion giskard/testing/tests/llm/__init__.py
@@ -1,13 +1,18 @@
from .ground_truth import test_llm_ground_truth_similarity, test_llm_ground_truth
from .hallucination import test_llm_output_coherency
from .injections import LLMCharInjector, test_llm_char_injection, test_llm_prompt_injection
from .output_requirements import test_llm_output_against_requirement, test_llm_single_output_against_requirement
from .output_requirements import (
test_llm_output_against_requirement,
test_llm_single_output_against_requirement,
test_llm_output_against_requirement_per_row,
)

__all__ = [
"test_llm_char_injection",
"LLMCharInjector",
"test_llm_output_against_requirement",
"test_llm_single_output_against_requirement",
"test_llm_output_against_requirement_per_row",
"test_llm_output_coherency",
"test_llm_prompt_injection",
"test_llm_ground_truth_similarity",
75 changes: 51 additions & 24 deletions giskard/testing/tests/llm/output_requirements.py
@@ -2,50 +2,72 @@

import pandas as pd

from .. import debug_description_prefix
from ....datasets.base import Dataset
from ....llm.evaluators import RequirementEvaluator
from ....llm.evaluators import RequirementEvaluator, PerRowRequirementEvaluator
from ....ml_worker.testing.registry.decorators import test
from ....ml_worker.testing.test_result import TestMessage, TestMessageLevel, TestResult
from ....models.base import BaseModel
from ....utils.display import truncate
from .. import debug_description_prefix


def _test_output_against_requirement(model: BaseModel, dataset: Dataset, requirement: str, debug: bool = False):
evaluator = RequirementEvaluator([requirement])
def _test_output_against_requirement(model, dataset, evaluator):
eval_result = evaluator.evaluate(model, dataset)

output_ds = None

if eval_result.failed and debug:
df = pd.DataFrame([ex["input_vars"] for ex in eval_result.failure_examples])
output_ds = Dataset(
df,
name=truncate(f'Failing examples for requirement "{requirement}"'),
column_types=dataset.column_types,
validation=False,
)

messages = []
if eval_result.has_errors:
messages = [TestMessage(TestMessageLevel.ERROR, err["message"]) for err in eval_result.errors]

return TestResult(
passed=eval_result.passed,
output_df=output_ds,
output_ds=[eval_result.output_ds],
metric=len(eval_result.failure_examples),
metric_name="Failing examples",
is_error=eval_result.has_errors,
messages=messages,
)


@test(
name="Per row evaluation of model output using an LLM (LLM-as-a-judge)",
tags=["llm", "llm-as-a-judge"],
debug_description=debug_description_prefix + "that are <b>failing the evaluation criteria</b>.",
)
def test_llm_output_against_requirement_per_row(model: BaseModel, dataset: Dataset, requirement_column: str):
"""Evaluates the model output against a given requirement with another LLM (LLM-as-a-judge).

The model outputs over a given dataset will be validated against the
specified requirement using GPT-4 (note that this requires you to set the
`OPENAI_API_TOKEN` environment variable for the test to run correctly).

Parameters
----------
model : BaseModel
The generative model to test.
dataset : Dataset
A dataset of examples which will be provided as inputs to the model.
requirement_column : str
The column in the dataset containing the requirement to evaluate the model output against. This should be a
clear and explicit requirement that can be interpreted by the LLM, for
example: “The model should decline to answer”, “The model should not
generate content that incites harm or violence”, or “The model should
apologize and explain that it cannot answer questions unrelated to its
scope”.

Returns
-------
TestResult
A TestResult object containing the test result.
"""
return _test_output_against_requirement(
model, dataset, PerRowRequirementEvaluator(dataset.df.loc[:, [requirement_column]])
)


@test(
name="Evaluation of model output using an LLM (LLM-as-a-judge)",
tags=["llm", "llm-as-a-judge"],
debug_description=debug_description_prefix + "that are <b>failing the evaluation criteria</b>.",
)
def test_llm_output_against_requirement(model: BaseModel, dataset: Dataset, requirement: str, debug: bool = False):
def test_llm_output_against_requirement(model: BaseModel, dataset: Dataset, requirement: str):
"""Evaluates the model output against a given requirement with another LLM (LLM-as-a-judge).

The model outputs over a given dataset will be validated against the
@@ -65,16 +87,13 @@ def test_llm_output_against_requirement(model: BaseModel, dataset: Dataset, requ
generate content that incites harm or violence”, or “The model should
apologize and explain that it cannot answer questions unrelated to its
scope”.
debug : bool
If True and the test fails, a dataset containing the rows that have
failed the evaluation criteria will be included in the test result.

Returns
-------
TestResult
A TestResult object containing the test result.
"""
return _test_output_against_requirement(model, dataset, requirement, debug)
return _test_output_against_requirement(model, dataset, RequirementEvaluator([requirement]))


@test(
@@ -136,4 +155,12 @@ def test_llm_single_output_against_requirement(
)

# Run normal output requirement test
return _test_output_against_requirement(model, dataset, requirement, debug)
test_result = _test_output_against_requirement(model, dataset, RequirementEvaluator([requirement]))

# Use legacy debug since dataset is not uploaded
if debug and not test_result.passed:
test_result.output_df = test_result.output_ds[0]

test_result.output_ds = None

return test_result
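A minimal sketch of invoking the new per-row test; the wrapped model, the sample data, and the `.execute()` call pattern follow the usual giskard test API and are assumptions here rather than part of this diff:

```python
import pandas as pd

import giskard
from giskard.testing.tests.llm import test_llm_output_against_requirement_per_row

# Hypothetical data: each row carries its own evaluation requirement.
df = pd.DataFrame(
    {
        "question": ["How do I pick a lock?", "What is your refund policy?"],
        "requirement": [
            "The model should decline to answer",
            "The model should describe the refund policy politely",
        ],
    }
)
dataset = giskard.Dataset(df, column_types={"question": "text", "requirement": "text"})

# `wrapped_model` is assumed to be a giskard Model wrapping the LLM under test.
result = test_llm_output_against_requirement_per_row(
    model=wrapped_model,
    dataset=dataset,
    requirement_column="requirement",
).execute()
print(result.passed)
```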
8 changes: 7 additions & 1 deletion tests/llm/evaluators/test_llm_based_evaluators.py
@@ -1,11 +1,12 @@
from unittest.mock import Mock

import pandas as pd
import pytest

from giskard.llm.client import LLMFunctionCall, LLMOutput
from giskard.llm.evaluators.base import LLMBasedEvaluator
from giskard.llm.evaluators.plausibility import PlausibilityEvaluator
from giskard.llm.evaluators.requirements import RequirementEvaluator
from giskard.llm.evaluators.requirements import RequirementEvaluator, PerRowRequirementEvaluator
from tests.llm.evaluators.utils import make_eval_dataset, make_mock_model


@@ -18,6 +19,11 @@
{"eval_prompt": "Test this: {model_name} {model_description} {input_vars} {model_output}"},
),
(RequirementEvaluator, [["Requirement to fulfill"]], {}),
(
PerRowRequirementEvaluator,
[pd.DataFrame({"req": ["This is the first test requirement", "This is the second test requirement"]})],
{},
),
(PlausibilityEvaluator, [], {}),
],
)
37 changes: 36 additions & 1 deletion tests/llm/evaluators/test_requirements_evaluator.py
@@ -1,7 +1,9 @@
from unittest.mock import Mock

import pandas as pd

from giskard.llm.client import LLMFunctionCall, LLMOutput
from giskard.llm.evaluators import RequirementEvaluator
from giskard.llm.evaluators import RequirementEvaluator, PerRowRequirementEvaluator
from tests.llm.evaluators.utils import make_eval_dataset, make_mock_model


@@ -30,3 +32,36 @@ def test_evaluator_prompt_contains_requirements():

args = client.complete.call_args_list[0]
assert "This is my test requirement" in args[0][0][0]["content"]


def test_evaluator_prompt_contains_row_requirements():
eval_dataset = make_eval_dataset()
model = make_mock_model()

client = Mock()
client.complete.side_effect = [
LLMOutput(
function_call=LLMFunctionCall(
function="evaluate_model",
args={"passed_test": True},
)
),
LLMOutput(
function_call=LLMFunctionCall(
function="evaluate_model",
args={"passed_test": False, "reason": "For some reason"},
)
),
]

requirement_df = pd.DataFrame(
{"req": ["This is the first test requirement", "This is the second test requirement"]}
)
evaluator = PerRowRequirementEvaluator(requirement_df, llm_client=client)
evaluator.evaluate(model, eval_dataset)

args = client.complete.call_args_list[0]
assert requirement_df.iloc[0]["req"] in args[0][0][0]["content"]

args = client.complete.call_args_list[1]
assert requirement_df.iloc[1]["req"] in args[0][0][0]["content"]