4 changes: 4 additions & 0 deletions giskard/llm/generators/adversarial.py

@@ -1,4 +1,5 @@
 from ...models.base.model import BaseModel
+from ...utils.display import truncate
 from .base import BaseDataGenerator
 
 GENERATE_INPUTS_PROMPT = """
@@ -42,6 +43,9 @@ def __init__(self, issue_description, requirement, *args, **kwargs):
         self.issue_description = issue_description
         self.requirement = requirement
 
+    def _make_dataset_name(self, model: BaseModel, num_samples):
+        return truncate(f"Adversarial Examples for requirement “{self.requirement}”")
+
     def _make_generate_input_prompt(self, model: BaseModel, num_inputs: int):
         return self.prompt.format(
             issue_description=self.issue_description,
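The new `_make_dataset_name` hook gives each generated dataset a human-readable name. A minimal sketch of how a custom generator could follow the same pattern — assuming, as the diff suggests, that `BaseDataGenerator.generate_dataset` calls this hook to name the `Dataset` it builds, and that `truncate` caps over-long names; `TopicDataGenerator` and `topic` are hypothetical:

from giskard.llm.generators.base import BaseDataGenerator
from giskard.models.base.model import BaseModel
from giskard.utils.display import truncate


class TopicDataGenerator(BaseDataGenerator):
    # Hypothetical subclass, for illustration only.
    def __init__(self, topic: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.topic = topic

    def _make_dataset_name(self, model: BaseModel, num_samples):
        # Same pattern as the PR: a descriptive name, truncated for display.
        return truncate(f"{self.topic} examples for {model.meta.name}")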
4 changes: 4 additions & 0 deletions giskard/llm/generators/implausible.py

@@ -1,4 +1,5 @@
 from ...models.base import BaseModel
+from ...utils.display import truncate
 from .base import BaseDataGenerator
 
 
@@ -24,5 +25,8 @@ class ImplausibleDataGenerator(BaseDataGenerator):
     Think step by step and then call the `generate_inputs` function with the generated inputs. You must generate {num_samples} inputs.
     """
+
+    def _make_dataset_name(self, model: BaseModel, num_samples):
+        return truncate(f"Synthetic Implausible Data for {model.meta.name}")
12 changes: 10 additions & 2 deletions giskard/llm/generators/sycophancy.py

@@ -90,7 +90,15 @@ def generate_dataset(self, model: BaseModel, num_samples=10, column_types=None):
         except (AttributeError, KeyError) as err:
             raise LLMGenerationError("Could not parse generated inputs") from err
 
-        dataset_1 = Dataset(pd.DataFrame([p["input_version_1"] for p in input_pairs]), column_types=column_types)
-        dataset_2 = Dataset(pd.DataFrame([p["input_version_2"] for p in input_pairs]), column_types=column_types)
+        dataset_1 = Dataset(
+            pd.DataFrame([p["input_version_1"] for p in input_pairs]),
+            name=f"Sycophancy examples for {model.meta.name} (set 1)",
+            column_types=column_types,
+        )
+        dataset_2 = Dataset(
+            pd.DataFrame([p["input_version_2"] for p in input_pairs]),
+            name=f"Sycophancy examples for {model.meta.name} (set 2)",
+            column_types=column_types,
+        )
 
         return dataset_1, dataset_2
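Since the `generate_dataset` signature shown above is unchanged, callers get the names for free. A hedged usage sketch — `model` is assumed to be an existing giskard `BaseModel` wrapper, and default construction of the generator is an assumption:

from giskard.llm.generators.sycophancy import SycophancyDataGenerator

generator = SycophancyDataGenerator()  # assumes defaults are acceptable
dataset_1, dataset_2 = generator.generate_dataset(model, num_samples=10)

# Each paired dataset now carries a descriptive name, e.g.
# "Sycophancy examples for <model name> (set 1)".
print(dataset_1.name, dataset_2.name)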
6 changes: 3 additions & 3 deletions giskard/scanner/llm/base.py

@@ -7,7 +7,7 @@
 from ...llm.generators import AdversarialDataGenerator
 from ...llm.testcase import TestcaseRequirementsGenerator
 from ...models.base.model import BaseModel
-from ...testing.tests.llm.output_requirements import test_llm_output_requirement
+from ...testing.tests.llm import test_llm_output_against_requirement
 from ..issues import Issue
 from ..scanner import logger
 
@@ -66,7 +66,7 @@ def make_issue(self, model: BaseModel, dataset: Dataset, requirement: str, examp
 
 def _generate_output_requirement_tests(issue: Issue):
     return {
-        issue.meta["requirement"]: test_llm_output_requirement(
-            model=issue.model, dataset=issue.dataset, requirement=issue.meta["requirement"]
+        issue.meta["requirement"]: test_llm_output_against_requirement(
+            dataset=issue.dataset, requirement=issue.meta["requirement"]
         )
     }
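Dropping `model=` here means the generated tests keep their model input unbound, so the same suite can be run against any model. A sketch of the apparent intent (an inference from this diff, not something it states), using giskard's scan/suite API as I understand it:

import giskard

results = giskard.scan(model, dataset)            # yields Issues like the ones above
suite = results.generate_test_suite("LLM scan")   # tests are built without a bound model

# The model is supplied only at run time, so the suite is reusable:
suite.run(model=model)
# e.g. later, against a retrained variant:
# suite.run(model=improved_model)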
6 changes: 3 additions & 3 deletions giskard/scanner/llm/llm_basic_sycophancy_detector.py

@@ -6,7 +6,7 @@
 from ...llm.evaluators.coherency import CoherencyEvaluator
 from ...llm.generators.sycophancy import SycophancyDataGenerator
 from ...models.base.model import BaseModel
-from ...testing.tests.llm.hallucination import test_llm_coherency
+from ...testing.tests.llm.hallucination import test_llm_output_coherency
 from ..decorators import detector
 from ..issues import Hallucination, Issue, IssueLevel
 from ..scanner import logger
 
@@ -61,7 +61,7 @@ def run(self, model: BaseModel, dataset: Dataset) -> Sequence[Issue]:
 
 def _generate_sycophancy_tests(issue: Issue):
     return {
-        "Basic Sycophancy": test_llm_coherency(
-            model=issue.model, dataset_1=issue.meta["dataset_1"], dataset_2=issue.meta["dataset_2"]
+        "Basic Sycophancy": test_llm_output_coherency(
+            dataset_1=issue.meta["dataset_1"], dataset_2=issue.meta["dataset_2"]
         )
     }
1 change: 0 additions & 1 deletion giskard/scanner/llm/llm_chars_injection_detector.py

@@ -101,7 +101,6 @@ def _generate_char_injection_tests(issue: Issue):
     feature = issue.features[0]
     return {
         f"Character injection ({issue.meta['special_char'].encode('unicode_escape').decode('ascii')}) in “{feature}”": test_llm_char_injection(
-            model=issue.model,
             dataset=issue.dataset,
             characters=[issue.meta["special_char"]],
             features=issue.features,
4 changes: 2 additions & 2 deletions giskard/scanner/llm/llm_implausible_output_detector.py

@@ -8,7 +8,7 @@
 from ...llm.evaluators import PlausibilityEvaluator
 from ...llm.generators import ImplausibleDataGenerator
 from ...models.base.model import BaseModel
-from ...testing.tests.llm.hallucination import test_llm_plausibility
+from ...testing.tests.llm.hallucination import test_llm_output_plausibility
 from ..decorators import detector
 from ..issues import Hallucination, Issue, IssueLevel
 
@@ -55,4 +55,4 @@ def run(self, model: BaseModel, dataset: Dataset) -> Sequence[Issue]:
 
 
 def _generate_implausible_output_tests(issue: Issue):
-    return {"Output plausibility": test_llm_plausibility(model=issue.model, dataset=issue.dataset)}
+    return {"Output plausibility": test_llm_output_plausibility(dataset=issue.dataset)}
1 change: 0 additions & 1 deletion giskard/scanner/llm/llm_prompt_injection_detector.py

@@ -160,7 +160,6 @@ def _generate_prompt_injection_tests(issue: Issue):
 
     return {
         f"Prompt injection ({issue.meta['domain'].encode('unicode_escape').decode('ascii')})": test_llm_prompt_injection(
-            model=issue.model,
             dataset=prompt_dataset,
             threshold=issue.meta["threshold"],
             **kwargs,
43 changes: 20 additions & 23 deletions giskard/testing/__init__.py

@@ -38,54 +38,51 @@
     "test_metamorphic_invariance_wilcoxon",
     "test_underconfidence_rate",
     "test_overconfidence_rate",
-    "test_llm_response_validation",
-    "test_llm_individual_response_validation",
 ]
 
-from giskard.testing.tests.calibration import test_underconfidence_rate, test_overconfidence_rate
+from giskard.testing.tests.calibration import test_overconfidence_rate, test_underconfidence_rate
 from giskard.testing.tests.drift import (
-    test_drift_psi,
     test_drift_chi_square,
-    test_drift_ks,
     test_drift_earth_movers_distance,
-    test_drift_prediction_psi,
+    test_drift_ks,
     test_drift_prediction_chi_square,
-    test_drift_prediction_ks,
     test_drift_prediction_earth_movers_distance,
+    test_drift_prediction_ks,
+    test_drift_prediction_psi,
+    test_drift_psi,
 )
-from giskard.testing.tests.llm import test_llm_response_validation, test_llm_individual_response_validation
 from giskard.testing.tests.metamorphic import (
-    test_metamorphic_invariance,
-    test_metamorphic_increasing,
     test_metamorphic_decreasing,
     test_metamorphic_decreasing_t_test,
-    test_metamorphic_increasing_t_test,
-    test_metamorphic_invariance_t_test,
     test_metamorphic_decreasing_wilcoxon,
+    test_metamorphic_increasing,
+    test_metamorphic_increasing_t_test,
     test_metamorphic_increasing_wilcoxon,
+    test_metamorphic_invariance,
+    test_metamorphic_invariance_t_test,
     test_metamorphic_invariance_wilcoxon,
 )
 from giskard.testing.tests.performance import (
-    test_auc,
-    test_f1,
     test_accuracy,
-    test_precision,
-    test_recall,
-    test_rmse,
-    test_mae,
-    test_r2,
+    test_auc,
     test_diff_accuracy,
     test_diff_f1,
     test_diff_precision,
     test_diff_recall,
     test_diff_rmse,
+    test_f1,
+    test_mae,
+    test_precision,
+    test_r2,
+    test_recall,
+    test_rmse,
 )
 from giskard.testing.tests.statistic import (
-    test_right_label,
-    test_output_in_range,
-    test_disparate_impact,
-    test_nominal_association,
     test_cramer_v,
+    test_disparate_impact,
     test_mutual_information,
+    test_nominal_association,
+    test_output_in_range,
+    test_right_label,
     test_theil_u,
 )
19 changes: 5 additions & 14 deletions giskard/testing/tests/llm/__init__.py

@@ -1,21 +1,12 @@
-from .hallucination import test_llm_coherency
+from .hallucination import test_llm_output_coherency
 from .injections import LLMCharInjector, test_llm_char_injection, test_llm_prompt_injection
-from .output_requirements import (
-    EvalTestResult,
-    test_llm_individual_response_validation,
-    test_llm_output_requirement,
-    test_llm_response_validation,
-    validate_test_case_with_reason,
-)
+from .output_requirements import test_llm_output_against_requirement, test_llm_single_output_against_requirement
 
 __all__ = [
     "test_llm_char_injection",
     "LLMCharInjector",
-    "EvalTestResult",
-    "test_llm_output_requirement",
-    "test_llm_response_validation",
-    "test_llm_individual_response_validation",
-    "test_llm_coherency",
-    "validate_test_case_with_reason",
+    "test_llm_output_against_requirement",
+    "test_llm_single_output_against_requirement",
+    "test_llm_output_coherency",
     "test_llm_prompt_injection",
 ]
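For downstream code, the renames translate into an import migration along these lines — a hedged sketch, with the "before" names taken from the removed `__all__` entries above:

# Before this PR (per the removed lines above):
#   from giskard.testing.tests.llm import test_llm_output_requirement, test_llm_coherency
# After this PR:
from giskard.testing.tests.llm import (
    test_llm_output_against_requirement,
    test_llm_output_coherency,
    test_llm_single_output_against_requirement,
)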
4 changes: 2 additions & 2 deletions giskard/testing/tests/llm/hallucination.py

@@ -9,7 +9,7 @@
 
 
 @test(name="LLM Coherency", tags=["llm", "hallucination"])
-def test_llm_coherency(
+def test_llm_output_coherency(
     model: BaseModel, dataset_1: Dataset, dataset_2: Optional[Dataset] = None, eval_prompt: Optional[str] = None
 ):
     """Tests that the model output is coherent for multiple inputs.
@@ -40,7 +40,7 @@ def test_llm_coherency(
 
 
 @test(name="LLM Plausibility", tags=["llm", "hallucination"])
-def test_llm_plausibility(model: BaseModel, dataset: Dataset, eval_prompt: Optional[str] = None):
+def test_llm_output_plausibility(model: BaseModel, dataset: Dataset, eval_prompt: Optional[str] = None):
     """Tests that the model output is plausible.
 
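The renamed tests keep their signatures, so a direct invocation looks like the following — a hedged sketch assuming `my_model`, `ds1`, and `ds2` are an existing giskard `BaseModel` and `Dataset`s, and using the partial-binding/`execute()` pattern of giskard's `@test` functions as I understand it:

from giskard.testing.tests.llm.hallucination import (
    test_llm_output_coherency,
    test_llm_output_plausibility,
)

# Outputs should stay coherent across paired input sets (e.g. the
# sycophancy generator's dataset_1/dataset_2):
result = test_llm_output_coherency(model=my_model, dataset_1=ds1, dataset_2=ds2).execute()
print(result.passed)

# Outputs should be plausible on a single dataset:
result = test_llm_output_plausibility(model=my_model, dataset=ds1).execute()
print(result.passed)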
2 changes: 1 addition & 1 deletion giskard/testing/tests/llm/injections.py

@@ -202,7 +202,7 @@ def test_llm_char_injection(
     if not result.passed:
         result.output_df = Dataset(
             pd.concat(fail_dfs),
-            name="Test dataset vulnerable to char injection (automatically generated)",
+            name="Test dataset vulnerable to character injection",
             column_types=dataset.column_types,
             validation=False,
         )