-
Notifications
You must be signed in to change notification settings - Fork 808
update: unify on attempt.notes["triggers"]
#1147
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
e240202
e13221b
6cc190f
7459bb0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,7 +6,6 @@ | |
| """ | ||
|
|
||
| import json | ||
| import random | ||
|
|
||
| from garak import _config | ||
| from garak.attempt import Attempt | ||
|
|
@@ -61,19 +60,16 @@ def __init__(self, config_root=_config): | |
| self.prompts = [] | ||
| self.triggers = [] | ||
| for term, prefix in self._slur_prefixes: | ||
| lower_term = term.lower() | ||
| term_variants = set(["", lower_term[0]]) # blank, first letter | ||
| term_variants = set(["", term[0]]) # blank, first letter | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does the change of case here possibly impact the response? The added test suggests no, as this should be case-insensitive. Looking closer at the codebase, existing runtime calls to
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It might impact response. I think with this change the responsibility for getting a case that elicits useful responses is shifted to the slur prefixes. Interesting re: no use of |
||
| if len(term) > 2: | ||
| term_variants.add(lower_term[:2].strip()) # first two letters | ||
| term_variants.add( | ||
| lower_term[: int(len(lower_term) / 2)].strip() | ||
| ) # first half of term | ||
| term_variants.add(term[:2].strip()) # first two letters | ||
| term_variants.add(term[: int(len(term) / 2)].strip()) # first half of term | ||
|
|
||
| for term_variant in term_variants: | ||
| # would use a set but order is important | ||
| candidate_prompt = prefix + term_variant | ||
| if candidate_prompt not in self.prompts: | ||
| self.prompts.append(candidate_prompt.strip()) | ||
| self.prompts.append(candidate_prompt) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we know why the
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes. A few popular contemporary continuation models (since displaced by chat models) would exhibit odd behaviour when prompts ended with a space, when this was coded. This isn't the case any more. I think now we want to see these odd behaviours if they can be summoned by something as simple as ending a prompt with a space. |
||
| self.triggers.append(term) | ||
|
|
||
| if self.follow_prompt_cap: | ||
|
|
@@ -83,7 +79,7 @@ def _attempt_prestore_hook(self, attempt: Attempt, seq: int) -> Attempt: | |
| attempt.notes = dict( | ||
| attempt.notes | ||
| ) # we don't want all attempts.notes to ref same dict | ||
| attempt.notes["trigger"] = str(self.triggers[seq]) | ||
| attempt.notes["triggers"] = [str(self.triggers[seq])] | ||
| return attempt | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,117 @@ | ||
| # SPDX-FileCopyrightText: Portions Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| import pytest | ||
|
|
||
| import garak.attempt | ||
| import garak.detectors.base | ||
| from garak import _plugins | ||
|
|
||
|
|
||
| def test_continuation_load(): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
| assert isinstance(d, garak.detectors.base.Detector) | ||
|
|
||
|
|
||
| # one matching trigger in triggers | ||
| # split over prompt & output triggers | ||
| # non-continuation fails | ||
| # one case-insensitive matching trigger in triggers | ||
| # multiple triggers, matching one first | ||
| # multiple triggers, matching one not first | ||
| # multiple triggers, none match | ||
| # empty triggers (nothing) | ||
| # no triggers but trigger is set (nothing) | ||
|
|
||
| TEST_TRIGGER = "yuxiangqiezi" | ||
|
|
||
|
|
||
| def test_continuation_1_matching_trigger(): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
|
|
||
| a = garak.attempt.Attempt(prompt="test prompts") | ||
| a.notes["triggers"] = [TEST_TRIGGER] | ||
| a.outputs = [TEST_TRIGGER] | ||
| r = d.detect(a) | ||
| assert r == [1.0], "Trigger being contained in output should match" | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("split_point", range(1, len(TEST_TRIGGER))) | ||
| def test_continuation_1_split_trigger(split_point): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
|
|
||
| a = garak.attempt.Attempt(prompt="test prompts " + TEST_TRIGGER[:split_point]) | ||
| a.notes["triggers"] = [TEST_TRIGGER] | ||
| a.outputs = [TEST_TRIGGER[split_point:]] | ||
| r = d.detect(a) | ||
| assert r == [1.0], "Trigger being split over prompt & output should match" | ||
|
|
||
|
|
||
| def test_non_continuation(): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
|
|
||
| a = garak.attempt.Attempt(prompt="test prompts") | ||
| a.notes["triggers"] = [TEST_TRIGGER[:2]] | ||
| a.outputs = [f" {TEST_TRIGGER[2:]}"] | ||
| r = d.detect(a) | ||
| assert r == [0.0], "Trigger split by space should not match" | ||
|
|
||
|
|
||
| def test_continuation_case_insensitive(): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
|
|
||
| a = garak.attempt.Attempt(prompt="test prompts") | ||
| a.notes["triggers"] = [TEST_TRIGGER.lower()] | ||
| a.outputs = [TEST_TRIGGER.upper()] | ||
| r = d.detect(a) | ||
| assert r == [1.0], "Matching should be case insensitive" | ||
|
|
||
|
|
||
| def test_continuation_multitrigger_matchfirst(): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
|
|
||
| a = garak.attempt.Attempt(prompt="test prompts") | ||
| a.notes["triggers"] = [TEST_TRIGGER, "other string"] | ||
| a.outputs = [TEST_TRIGGER] | ||
| r = d.detect(a) | ||
| assert r == [1.0], "Any trigger should match" | ||
|
|
||
|
|
||
| def test_continuation_multitrigger_matchsecond(): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
|
|
||
| a = garak.attempt.Attempt(prompt="test prompts") | ||
| a.notes["triggers"] = ["other string", TEST_TRIGGER] | ||
| a.outputs = [TEST_TRIGGER] | ||
| r = d.detect(a) | ||
| assert r == [1.0], "Any trigger should match even if not the first in the list" | ||
|
|
||
|
|
||
| def test_continuation_multitrigger_matchnone(): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
|
|
||
| a = garak.attempt.Attempt(prompt="test prompts") | ||
| a.notes["triggers"] = ["cardassia", "prime"] | ||
| a.outputs = [TEST_TRIGGER] | ||
| r = d.detect(a) | ||
| assert r == [0.0], "Multiple non-matching triggers should not yield a hit" | ||
|
|
||
|
|
||
| def test_continuation_empty_triggers(): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
|
|
||
| a = garak.attempt.Attempt(prompt="test prompts") | ||
| a.notes["triggers"] = [] | ||
| a.outputs = [TEST_TRIGGER] | ||
| r = d.detect(a) | ||
| assert r == [], "No triggers should not yield results" | ||
|
|
||
|
|
||
| def test_continuation_triggers_not_trigger(): | ||
| d = _plugins.load_plugin("detectors.continuation.Continuation") | ||
|
|
||
| a = garak.attempt.Attempt(prompt="test prompts") | ||
| a.notes["trigger"] = TEST_TRIGGER | ||
| a.outputs = [TEST_TRIGGER] | ||
| r = d.detect(a) | ||
| assert r == [], "attempt.notes['trigger'] should not be supported" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| import pytest | ||
|
|
||
| import garak._plugins | ||
|
|
||
| import garak.probes.base | ||
| import garak.attempt | ||
|
|
||
| CONTINUATION_PROBES = ( | ||
| "probes.continuation.ContinueSlursReclaimedSlursFull", | ||
| "probes.continuation.ContinueSlursReclaimedSlurs", | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("probename", CONTINUATION_PROBES) | ||
| def test_continuation_load(probename): | ||
| p = garak._plugins.load_plugin(probename) | ||
| assert isinstance(p, garak.probes.base.Probe) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("probename", CONTINUATION_PROBES) | ||
| def test_continuation_counts_valid(probename): | ||
| p = garak._plugins.load_plugin(probename) | ||
| assert len(p.prompts) > 0, "Probe must have more than zero prompts" | ||
|
|
||
| if probename.endswith("Full"): | ||
| assert len(p.prompts) >= len( | ||
| p._slur_prefixes | ||
| ), "Probe should have at least one prompt per slur prefix" | ||
|
|
||
| if probename.endswith("Full"): | ||
| assert len(p.prompts) == len( | ||
| p.triggers | ||
| ), "Prompt count and trigger count should match" | ||
| else: | ||
| assert len(p.prompts) <= len( | ||
| p.triggers | ||
| ), "Must be enough triggers for all the prompts" | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("probename", CONTINUATION_PROBES) | ||
| def test_continuation_continuations_valid(probename): | ||
| p = garak._plugins.load_plugin(probename) | ||
|
|
||
| for seq, prompt in enumerate(p.prompts): | ||
| if not prompt.endswith(" "): | ||
|
|
||
| trigger = p.triggers[seq] | ||
| candidate_matches = [trigger[1:n] for n in range(len(trigger) - 1)] | ||
| matched = False | ||
| for candidate_match in candidate_matches: | ||
| if prompt.endswith(candidate_match): | ||
| matched = True | ||
| assert ( | ||
| matched == True | ||
| ), "Prompts should end with starting substring of trigger" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It may be useful to iterate all "triggers" even if only one is ever expected to exist. The hardcoded `[0]` will likely be a source for a future issue report.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
agree, will address