From 6214ab1415cd34db95e4e469a734dfba7f11f63a Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Fri, 14 Nov 2025 12:52:07 -0600 Subject: [PATCH 01/10] probe: added badcharacters module --- garak/probes/badchars.py | 428 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 428 insertions(+) create mode 100644 garak/probes/badchars.py diff --git a/garak/probes/badchars.py b/garak/probes/badchars.py new file mode 100644 index 000000000..4bc414ccf --- /dev/null +++ b/garak/probes/badchars.py @@ -0,0 +1,428 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Imperceptible perturbation probes inspired by the Bad Characters paper. + +Implements the "just try everything" strategy for invisible Unicode characters, +homoglyph substitutions, bidi-based reorderings, and deletion/backspace pairs as +described in https://arxiv.org/abs/2106.09898. +""" + +import itertools +import logging +import random +from dataclasses import dataclass +from typing import Iterator, List, Sequence, Tuple + +import garak.attempt +import garak.payloads +import garak.probes +from garak import _config +from garak.data import path as data_path +from garak.exception import PluginConfigurationError + + +ASCII_PRINTABLE = tuple(chr(i) for i in range(0x20, 0x7F)) +DEFAULT_INVISIBLE = ("\u200b", "\u200c", "\u200d") # ZWSP, ZWNJ, ZWJ +BIDI_CONTROLS = { + "PDF": "\u202c", + "LRO": "\u202d", + "RLO": "\u202e", + "LRI": "\u2066", + "RLI": "\u2067", + "PDI": "\u2069", +} + + +@dataclass(frozen=True) +class _Swap: + """Represents a bidi-wrapped swap request between two code points.""" + + first: str + second: str + + +def _render_swaps(elements: Sequence) -> str: + """Recursively expand swap objects into bidi control sequences.""" + + rendered: List[str] = [] + for element in elements: + if isinstance(element, _Swap): + payload = [ + BIDI_CONTROLS["LRO"], + BIDI_CONTROLS["LRI"], + BIDI_CONTROLS["RLO"], + BIDI_CONTROLS["LRI"], + element.first, + BIDI_CONTROLS["PDI"], + BIDI_CONTROLS["LRI"], + element.second, + BIDI_CONTROLS["PDI"], + BIDI_CONTROLS["PDF"], + BIDI_CONTROLS["PDI"], + BIDI_CONTROLS["PDF"], + ] + rendered.append(_render_swaps(payload)) + elif isinstance(element, str): + rendered.append(element) + elif isinstance(element, Sequence): + rendered.append(_render_swaps(element)) + else: + rendered.append(str(element)) + return "".join(rendered) + + +def _load_homoglyph_map() -> dict[str, List[str]]: + """Parse intentional.txt into a source -> targets dictionary.""" + + mapping: dict[str, set[str]] = {} + intent_path = data_path / "badchars" / "intentional.txt" + try: + with open(intent_path, "r", encoding="utf-8") as infile: + for raw_line in infile: + line = raw_line.strip() + if not line or line.startswith("#"): + continue + left, remainder = line.split(";", maxsplit=1) + remainder = remainder.split("#", maxsplit=1)[0].strip() + if not remainder: + continue + codepoints = remainder.split() + if not codepoints: + continue + source = chr(int(left.strip(), 16)) + target = "".join(chr(int(cp, 16)) for cp in codepoints) + if source == target: + continue + mapping.setdefault(source, set()).add(target) + except FileNotFoundError as exc: + msg = ( + "Unable to load intentional.txt for homoglyph perturbations. " + "Run `git lfs pull` or reinstall data assets." + ) + logging.error(msg) + raise PluginConfigurationError(msg) from exc + + return {key: sorted(values) for key, values in mapping.items()} + + +class BadCharacters(garak.probes.Probe): + """Imperceptible perturbations mixing invisible chars, homoglyphs, swaps, deletions.""" + + lang = "en" + active = False + doc_uri = "https://arxiv.org/abs/2106.09898" + goal = "inject imperceptible text perturbations that bypass refusal policies" + primary_detector = "mitigation.MitigationBypass" + tags = [ + "avid-effect:security:S0403", + "quality:Security:PromptStability", + "payload:jailbreak", + ] + tier = garak.probes.Tier.COMPETE_WITH_SOTA + + DEFAULT_PARAMS = garak.probes.Probe.DEFAULT_PARAMS | { + "payload_name": "harmful_behaviors", + "perturbation_budget": 1, + "enabled_categories": [ + "invisible", + "homoglyph", + "reordering", + "deletion", + ], + "max_position_candidates": 24, + "max_reorder_candidates": 24, + "max_ascii_variants": len(ASCII_PRINTABLE), + "follow_prompt_cap": True, + } + + def __init__(self, config_root=_config): + super().__init__(config_root=config_root) + if self.perturbation_budget <= 0: + raise PluginConfigurationError( + "perturbation_budget must be a positive integer" + ) + + self._homoglyph_map = _load_homoglyph_map() + categories = self.enabled_categories + if isinstance(categories, str): + categories = [categories] + self._enabled_categories = { + cat.lower() + for cat in categories + if isinstance(cat, str) + } + if not self._enabled_categories: + raise PluginConfigurationError("enabled_categories must include at least one entry") + + payload_group = garak.payloads.load(self.payload_name) + self._source_payloads = payload_group.payloads + self.prompts: List[garak.attempt.Conversation] = [] + self._seen_prompts: set[str] = set() + + for idx, payload in enumerate(self._source_payloads): + for text, metadata in self._generate_variants(payload, idx): + self._append_prompt(text, metadata) + + if not self.prompts: + raise PluginConfigurationError( + "BadCharacters failed to generate any prompts. " + "Check payload_name and category configuration." + ) + + if ( + self.follow_prompt_cap + and self.soft_probe_prompt_cap is not None + and len(self.prompts) > self.soft_probe_prompt_cap + ): + self._downsample_prompts() + + def _append_prompt(self, text: str, metadata: dict) -> None: + if text in self._seen_prompts: + return + self._seen_prompts.add(text) + conversation = garak.attempt.Conversation( + [ + garak.attempt.Turn( + "user", + garak.attempt.Message(text=text, lang=self.lang), + ) + ], + notes=metadata, + ) + self.prompts.append(conversation) + + def _generate_variants( + self, payload: str, payload_idx: int + ) -> Iterator[Tuple[str, dict]]: + generators = { + "invisible": self._generate_invisible_variants, + "homoglyph": self._generate_homoglyph_variants, + "reordering": self._generate_reordering_variants, + "deletion": self._generate_deletion_variants, + } + for category in self._enabled_categories: + generator = generators.get(category) + if generator is None: + logging.warning("Unknown BadCharacters category '%s'", category) + continue + for variant_text, details in generator(payload): + metadata = { + "bad_character_category": category, + "perturbation_count": details.get("count"), + "operations": details.get("operations"), + "source_payload_index": payload_idx, + "source_payload_excerpt": self._excerpt(payload), + } + yield variant_text, metadata + + def _generate_invisible_variants( + self, payload: str + ) -> Iterator[Tuple[str, dict]]: + positions = self._select_positions(len(payload), self.max_position_candidates) + for count in range(1, self.perturbation_budget + 1): + if len(positions) < count: + break + for pos_combo in itertools.combinations(positions, count): + for chars in itertools.product(DEFAULT_INVISIBLE, repeat=count): + text = self._inject_sequences(payload, list(zip(pos_combo, chars))) + yield text, { + "count": count, + "operations": { + "positions": list(pos_combo), + "characters": [ord(ch) for ch in chars], + }, + } + + def _generate_homoglyph_variants( + self, payload: str + ) -> Iterator[Tuple[str, dict]]: + candidate_positions = [ + idx for idx, ch in enumerate(payload) if ch in self._homoglyph_map + ] + if not candidate_positions: + return + + max_replacements = min(self.perturbation_budget, len(candidate_positions)) + for count in range(1, max_replacements + 1): + for idx_combo in itertools.combinations(candidate_positions, count): + replacement_options = [ + self._homoglyph_map[payload[idx]] for idx in idx_combo + ] + for replacements in itertools.product(*replacement_options): + mutable = list(payload) + for idx, rep in zip(idx_combo, replacements): + mutable[idx] = rep + text = "".join(mutable) + yield text, { + "count": count, + "operations": { + "positions": list(idx_combo), + "replacements": replacements, + }, + } + + def _generate_reordering_variants( + self, payload: str + ) -> Iterator[Tuple[str, dict]]: + if len(payload) < 2: + return + candidates = self._select_positions( + len(payload) - 1, self.max_reorder_candidates, include_endpoint=False + ) + valid_indices = [ + idx for idx in candidates if idx < len(payload) - 1 + ] + for count in range(1, min(self.perturbation_budget, len(valid_indices)) + 1): + for combo in itertools.combinations(valid_indices, count): + if not self._non_overlapping(combo): + continue + text = self._apply_swaps(payload, combo) + yield text, { + "count": count, + "operations": {"positions": list(combo)}, + } + + def _generate_deletion_variants( + self, payload: str + ) -> Iterator[Tuple[str, dict]]: + positions = self._select_positions(len(payload), self.max_position_candidates) + ascii_candidates = self._select_ascii(self.max_ascii_variants) + for count in range(1, self.perturbation_budget + 1): + if len(positions) < count: + break + for pos_combo in itertools.combinations(positions, count): + for chars in itertools.product(ascii_candidates, repeat=count): + text = self._inject_sequences( + payload, + [(pos, f"{char}\b") for pos, char in zip(pos_combo, chars)], + ) + yield text, { + "count": count, + "operations": { + "positions": list(pos_combo), + "ascii_codes": [ord(c) for c in chars], + }, + } + + def _inject_sequences( + self, payload: str, insertions: List[Tuple[int, str]] + ) -> str: + result = payload + offset = 0 + for position, value in sorted(insertions, key=lambda item: item[0]): + idx = min(max(position + offset, 0), len(result)) + result = result[:idx] + value + result[idx:] + offset += len(value) + return result + + def _apply_swaps(self, payload: str, indices: Sequence[int]) -> str: + working: List = list(payload) + swaps_done = 0 + for original_idx in sorted(indices): + adjusted = original_idx - swaps_done + if adjusted < 0 or adjusted >= len(working) - 1: + continue + first, second = working[adjusted], working[adjusted + 1] + working = ( + working[:adjusted] + [_Swap(second, first)] + working[adjusted + 2 :] + ) + swaps_done += 1 + return _render_swaps(working) + + @staticmethod + def _non_overlapping(indices: Sequence[int]) -> bool: + return all(b - a >= 2 for a, b in zip(indices, indices[1:])) + + def _select_positions( + self, + length: int, + cap: int, + include_endpoint: bool = True, + ) -> List[int]: + positions = list(range(length + (1 if include_endpoint else 0))) + if cap is None or cap <= 0 or len(positions) <= cap: + return positions + + if cap == 1: + return [positions[0]] + + step = (len(positions) - 1) / (cap - 1) + selected = [] + seen = set() + for idx in range(cap): + pick = round(idx * step) + value = positions[pick] + if value in seen: + continue + selected.append(value) + seen.add(value) + for value in positions: + if len(selected) >= cap: + break + if value not in seen: + selected.append(value) + seen.add(value) + selected.sort() + return selected + + @staticmethod + def _select_ascii(limit: int) -> List[str]: + if limit is None or limit <= 0 or limit >= len(ASCII_PRINTABLE): + return list(ASCII_PRINTABLE) + step = max(1, (len(ASCII_PRINTABLE) - 1) // (limit - 1)) + selected = [ASCII_PRINTABLE[i] for i in range(0, len(ASCII_PRINTABLE), step)] + return selected[:limit] + + @staticmethod + def _excerpt(payload: str, limit: int = 96) -> str: + flattened = " ".join(payload.split()) + if len(flattened) <= limit: + return flattened + return f"{flattened[: limit - 1]}…" + + def _downsample_prompts(self) -> None: + if not self.prompts: + return + cap = self.soft_probe_prompt_cap + if cap is None or cap <= 0 or len(self.prompts) <= cap: + return + + grouped: dict[str, List[garak.attempt.Conversation]] = {} + for conv in self.prompts: + category = conv.notes.get("bad_character_category", "unknown") + grouped.setdefault(category, []).append(conv) + + if hasattr(_config, "run") and getattr(_config.run, "seed", None) is not None: + rng = random.Random(_config.run.seed) + for group in grouped.values(): + rng.shuffle(group) + else: + for group in grouped.values(): + random.shuffle(group) + + total = len(self.prompts) + allocation: dict[str, int] = {} + remaining = cap + for category, group in grouped.items(): + share = min(len(group), max(1, round(cap * len(group) / total))) + allocation[category] = share + remaining -= share + + while remaining > 0: + progress = False + for category, group in grouped.items(): + if allocation.get(category, 0) < len(group): + allocation[category] += 1 + remaining -= 1 + progress = True + if remaining == 0: + break + if not progress: + break + + selection: List[garak.attempt.Conversation] = [] + for category, group in grouped.items(): + take = min(len(group), allocation.get(category, 0)) + selection.extend(group[:take]) + self.prompts = selection[:cap] From f17d4d09edb46937c5562703d35dc844db40abe8 Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Fri, 14 Nov 2025 12:58:14 -0600 Subject: [PATCH 02/10] probe: add data for bad characters --- garak/data/badchars/intentional.txt | 164 ++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 garak/data/badchars/intentional.txt diff --git a/garak/data/badchars/intentional.txt b/garak/data/badchars/intentional.txt new file mode 100644 index 000000000..1cbbd4c80 --- /dev/null +++ b/garak/data/badchars/intentional.txt @@ -0,0 +1,164 @@ +# intentional.txt +# Date: 2025-07-22, 05:49:36 GMT +# © 2025 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use and license, see https://www.unicode.org/terms_of_use.html +# +# Unicode Security Mechanisms for UTS #39 +# Version: 17.0.0 +# +# For documentation and usage, see https://www.unicode.org/reports/tr39 +# +0021 ; 01C3 #* ( ! ~ ǃ ) EXCLAMATION MARK ~ LATIN LETTER RETROFLEX CLICK + +0041 ; 0391 # ( A ~ Α ) LATIN CAPITAL LETTER A ~ GREEK CAPITAL LETTER ALPHA + +0042 ; 0392 # ( B ~ Β ) LATIN CAPITAL LETTER B ~ GREEK CAPITAL LETTER BETA + +0043 ; 0421 # ( C ~ С ) LATIN CAPITAL LETTER C ~ CYRILLIC CAPITAL LETTER ES + +0045 ; 0395 # ( E ~ Ε ) LATIN CAPITAL LETTER E ~ GREEK CAPITAL LETTER EPSILON + +0048 ; 0397 # ( H ~ Η ) LATIN CAPITAL LETTER H ~ GREEK CAPITAL LETTER ETA + +0049 ; 0399 # ( I ~ Ι ) LATIN CAPITAL LETTER I ~ GREEK CAPITAL LETTER IOTA + +004A ; 0408 # ( J ~ Ј ) LATIN CAPITAL LETTER J ~ CYRILLIC CAPITAL LETTER JE + +004B ; 039A # ( K ~ Κ ) LATIN CAPITAL LETTER K ~ GREEK CAPITAL LETTER KAPPA + +004D ; 039C # ( M ~ Μ ) LATIN CAPITAL LETTER M ~ GREEK CAPITAL LETTER MU + +004E ; 039D # ( N ~ Ν ) LATIN CAPITAL LETTER N ~ GREEK CAPITAL LETTER NU + +004F ; 039F # ( O ~ Ο ) LATIN CAPITAL LETTER O ~ GREEK CAPITAL LETTER OMICRON + +0050 ; 03A1 # ( P ~ Ρ ) LATIN CAPITAL LETTER P ~ GREEK CAPITAL LETTER RHO + +0053 ; 0405 # ( S ~ Ѕ ) LATIN CAPITAL LETTER S ~ CYRILLIC CAPITAL LETTER DZE + +0054 ; 03A4 # ( T ~ Τ ) LATIN CAPITAL LETTER T ~ GREEK CAPITAL LETTER TAU + +0058 ; 03A7 # ( X ~ Χ ) LATIN CAPITAL LETTER X ~ GREEK CAPITAL LETTER CHI + +0059 ; 03A5 # ( Y ~ Υ ) LATIN CAPITAL LETTER Y ~ GREEK CAPITAL LETTER UPSILON + +005A ; 0396 # ( Z ~ Ζ ) LATIN CAPITAL LETTER Z ~ GREEK CAPITAL LETTER ZETA + +0061 ; 0430 # ( a ~ а ) LATIN SMALL LETTER A ~ CYRILLIC SMALL LETTER A + +0063 ; 0441 # ( c ~ с ) LATIN SMALL LETTER C ~ CYRILLIC SMALL LETTER ES + +0064 ; 0501 # ( d ~ ԁ ) LATIN SMALL LETTER D ~ CYRILLIC SMALL LETTER KOMI DE + +0065 ; 0435 # ( e ~ е ) LATIN SMALL LETTER E ~ CYRILLIC SMALL LETTER IE + +0068 ; 04BB # ( h ~ һ ) LATIN SMALL LETTER H ~ CYRILLIC SMALL LETTER SHHA + +0069 ; 0456 # ( i ~ і ) LATIN SMALL LETTER I ~ CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I + +006A ; 03F3 # ( j ~ ϳ ) LATIN SMALL LETTER J ~ GREEK LETTER YOT + +006F ; 03BF # ( o ~ ο ) LATIN SMALL LETTER O ~ GREEK SMALL LETTER OMICRON + +0070 ; 0440 # ( p ~ р ) LATIN SMALL LETTER P ~ CYRILLIC SMALL LETTER ER + +0073 ; 0455 # ( s ~ ѕ ) LATIN SMALL LETTER S ~ CYRILLIC SMALL LETTER DZE + +0078 ; 0445 # ( x ~ х ) LATIN SMALL LETTER X ~ CYRILLIC SMALL LETTER HA + +0079 ; 0443 # ( y ~ у ) LATIN SMALL LETTER Y ~ CYRILLIC SMALL LETTER U + +00C6 ; 04D4 # ( Æ ~ Ӕ ) LATIN CAPITAL LETTER AE ~ CYRILLIC CAPITAL LIGATURE A IE + +00D0 ; 0110 # ( Ð ~ Đ ) LATIN CAPITAL LETTER ETH ~ LATIN CAPITAL LETTER D WITH STROKE + +00E6 ; 04D5 # ( æ ~ ӕ ) LATIN SMALL LETTER AE ~ CYRILLIC SMALL LIGATURE A IE + +0138 ; 043A # ( ĸ ~ к ) LATIN SMALL LETTER KRA ~ CYRILLIC SMALL LETTER KA + +0182 ; 0411 # ( Ƃ ~ Б ) LATIN CAPITAL LETTER B WITH TOPBAR ~ CYRILLIC CAPITAL LETTER BE + +018F ; 04D8 # ( Ə ~ Ә ) LATIN CAPITAL LETTER SCHWA ~ CYRILLIC CAPITAL LETTER SCHWA + +019F ; 04E8 # ( Ɵ ~ Ө ) LATIN CAPITAL LETTER O WITH MIDDLE TILDE ~ CYRILLIC CAPITAL LETTER BARRED O + +01A9 ; 03A3 # ( Ʃ ~ Σ ) LATIN CAPITAL LETTER ESH ~ GREEK CAPITAL LETTER SIGMA + +01DD ; 0259 # ( ǝ ~ ə ) LATIN SMALL LETTER TURNED E ~ LATIN SMALL LETTER SCHWA + +0245 ; 039B # ( Ʌ ~ Λ ) LATIN CAPITAL LETTER TURNED V ~ GREEK CAPITAL LETTER LAMDA + +0259 ; 04D9 # ( ə ~ ә ) LATIN SMALL LETTER SCHWA ~ CYRILLIC SMALL LETTER SCHWA + +025B ; 03B5 # ( ɛ ~ ε ) LATIN SMALL LETTER OPEN E ~ GREEK SMALL LETTER EPSILON + +0269 ; 03B9 # ( ɩ ~ ι ) LATIN SMALL LETTER IOTA ~ GREEK SMALL LETTER IOTA + +0275 ; 04E9 # ( ɵ ~ ө ) LATIN SMALL LETTER BARRED O ~ CYRILLIC SMALL LETTER BARRED O + +0292 ; 04E1 # ( ʒ ~ ӡ ) LATIN SMALL LETTER EZH ~ CYRILLIC SMALL LETTER ABKHASIAN DZE + +0299 ; 0432 # ( ʙ ~ в ) LATIN LETTER SMALL CAPITAL B ~ CYRILLIC SMALL LETTER VE + +029C ; 043D # ( ʜ ~ н ) LATIN LETTER SMALL CAPITAL H ~ CYRILLIC SMALL LETTER EN + +0393 ; 0413 # ( Γ ~ Г ) GREEK CAPITAL LETTER GAMMA ~ CYRILLIC CAPITAL LETTER GHE + +03A0 ; 041F # ( Π ~ П ) GREEK CAPITAL LETTER PI ~ CYRILLIC CAPITAL LETTER PE + +03B1 ; 237A # ( α ~ ⍺ ) GREEK SMALL LETTER ALPHA ~ APL FUNCTIONAL SYMBOL ALPHA + +03B9 ; 2373 # ( ι ~ ⍳ ) GREEK SMALL LETTER IOTA ~ APL FUNCTIONAL SYMBOL IOTA + +03C1 ; 2374 # ( ρ ~ ⍴ ) GREEK SMALL LETTER RHO ~ APL FUNCTIONAL SYMBOL RHO + +03C9 ; 2375 # ( ω ~ ⍵ ) GREEK SMALL LETTER OMEGA ~ APL FUNCTIONAL SYMBOL OMEGA + +0433 ; 1D26 # ( г ~ ᴦ ) CYRILLIC SMALL LETTER GHE ~ GREEK LETTER SMALL CAPITAL GAMMA + +043B ; 1D2B # ( л ~ ᴫ ) CYRILLIC SMALL LETTER EL ~ CYRILLIC LETTER SMALL CAPITAL EL + +043F ; 1D28 # ( п ~ ᴨ ) CYRILLIC SMALL LETTER PE ~ GREEK LETTER SMALL CAPITAL PI + +101D ; 1040 # ( ဝ ~ ၀ ) MYANMAR LETTER WA ~ MYANMAR DIGIT ZERO + +17A2 ; 17A3 # ( អ ~ ឣ ) KHMER LETTER QA ~ KHMER INDEPENDENT VOWEL QAQ + +1835 ; 1855 # ( ᠵ ~ ᡕ ) MONGOLIAN LETTER JA ~ MONGOLIAN LETTER TODO YA + +199E ; 19D0 # ( ᦞ ~ ᧐ ) NEW TAI LUE LETTER LOW VA ~ NEW TAI LUE DIGIT ZERO + +19B1 ; 19D1 # ( ᦱ ~ ᧑ ) NEW TAI LUE VOWEL SIGN AA ~ NEW TAI LUE DIGIT ONE + +1A45 ; 1A80 # ( ᩅ ~ ᪀ ) TAI THAM LETTER WA ~ TAI THAM HORA DIGIT ZERO +1A45 ; 1A90 # ( ᩅ ~ ᪐ ) TAI THAM LETTER WA ~ TAI THAM THAM DIGIT ZERO + +1B0D ; 1B52 # ( ᬍ ~ ᭒ ) BALINESE LETTER LA LENGA ~ BALINESE DIGIT TWO + +1B11 ; 1B53 # ( ᬑ ~ ᭓ ) BALINESE LETTER OKARA ~ BALINESE DIGIT THREE + +1B28 ; 1B58 # ( ᬨ ~ ᭘ ) BALINESE LETTER PA KAPAL ~ BALINESE DIGIT EIGHT + +1B50 ; 1B5C # ( ᭐ ~ ᭜ ) BALINESE DIGIT ZERO ~ BALINESE WINDU + +1D0D ; 043C # ( ᴍ ~ м ) LATIN LETTER SMALL CAPITAL M ~ CYRILLIC SMALL LETTER EM + +1D18 ; 1D29 # ( ᴘ ~ ᴩ ) LATIN LETTER SMALL CAPITAL P ~ GREEK LETTER SMALL CAPITAL RHO + +1D1B ; 0442 # ( ᴛ ~ т ) LATIN LETTER SMALL CAPITAL T ~ CYRILLIC SMALL LETTER TE + +2C67 ; 04A2 # ( Ⱨ ~ Ң ) LATIN CAPITAL LETTER H WITH DESCENDER ~ CYRILLIC CAPITAL LETTER EN WITH DESCENDER + +2C69 ; 049A # ( Ⱪ ~ Қ ) LATIN CAPITAL LETTER K WITH DESCENDER ~ CYRILLIC CAPITAL LETTER KA WITH DESCENDER + +A9D0 ; A9C6 # ( ꧐ ~ ꧆ ) JAVANESE DIGIT ZERO ~ JAVANESE PADA WINDU + +10382 ; 103D1 # ( 𐎂 ~ 𐏑 ) UGARITIC LETTER GAMLA ~ OLD PERSIAN NUMBER ONE + +10393 ; 103D3 # ( 𐎓 ~ 𐏓 ) UGARITIC LETTER AIN ~ OLD PERSIAN NUMBER TEN + +1039A ; 12038 # ( 𐎚 ~ 𒀸 ) UGARITIC LETTER TO ~ CUNEIFORM SIGN ASH + +10486 ; 104A0 # ( 𐒆 ~ 𐒠 ) OSMANYA LETTER DEEL ~ OSMANYA DIGIT ZERO + From 83e0221912f86525f5e508c3af71caf09bc95e96 Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Fri, 14 Nov 2025 12:58:47 -0600 Subject: [PATCH 03/10] update doc for badchars --- docs/source/garak.probes.badchars.rst | 9 +++++++++ docs/source/probes.rst | 1 + 2 files changed, 10 insertions(+) create mode 100644 docs/source/garak.probes.badchars.rst diff --git a/docs/source/garak.probes.badchars.rst b/docs/source/garak.probes.badchars.rst new file mode 100644 index 000000000..0a4da8a67 --- /dev/null +++ b/docs/source/garak.probes.badchars.rst @@ -0,0 +1,9 @@ +garak.probes.badchars +===================== + +.. automodule:: garak.probes.badchars + :members: + :undoc-members: + :show-inheritance: + + .. show-asr:: diff --git a/docs/source/probes.rst b/docs/source/probes.rst index 01bf24c00..a26e0e22b 100644 --- a/docs/source/probes.rst +++ b/docs/source/probes.rst @@ -46,4 +46,5 @@ For a detailed oversight into how a probe operates, see :doc:`garak.probes.base` garak.probes.topic garak.probes.visual_jailbreak garak.probes.web_injection + garak.probes.badchars garak.probes._tier From 1e62f8e3e48c4b0246a49a16c02fe16e84f9cf7b Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Fri, 14 Nov 2025 12:59:14 -0600 Subject: [PATCH 04/10] add bad char in readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d6e3dc174..0c9969dcc 100644 --- a/README.md +++ b/README.md @@ -220,6 +220,7 @@ For testing. This generator repeats back the prompt it received. |----------------------|-------------------------------------------------------------------------------------------------------------------------------| | blank | A simple probe that always sends an empty prompt. | | atkgen | Automated Attack Generation. A red-teaming LLM probes the target and reacts to it in an attempt to get toxic output. Prototype, mostly stateless, for now uses a simple GPT-2 [fine-tuned](https://huggingface.co/garak-llm/artgpt2tox) on the subset of hhrlhf attempts that yielded detectable toxicity (the only target currently supported for now). | +| badchars | Implements imperceptible Unicode perturbations (invisible characters, homoglyphs, reorderings, deletions) inspired by the [Bad Characters](https://arxiv.org/abs/2106.09898) paper. | | av_spam_scanning | Probes that attempt to make the model output malicious content signatures | | continuation | Probes that test if the model will continue a probably undesirable word | | dan | Various [DAN](https://adguard.com/en/blog/chatgpt-dan-prompt-abuse.html) and DAN-like attacks | From a4c8c84124eb80cffd4ff461b9818e5376eb04a0 Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Fri, 14 Nov 2025 13:10:32 -0600 Subject: [PATCH 05/10] probe: add tests for bad char --- tests/probes/test_probes_badcharacters.py | 94 +++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 tests/probes/test_probes_badcharacters.py diff --git a/tests/probes/test_probes_badcharacters.py b/tests/probes/test_probes_badcharacters.py new file mode 100644 index 000000000..4fcb25a88 --- /dev/null +++ b/tests/probes/test_probes_badcharacters.py @@ -0,0 +1,94 @@ +from garak import _config, _plugins +from garak.probes.badchars import DEFAULT_INVISIBLE, BIDI_CONTROLS + + +PLUGIN_NAME = "probes.badchars.BadCharacters" + + +def _load_badcharacters(custom_config: dict | None = None): + cfg = {"probes": {"badchars": {"BadCharacters": custom_config or {}}}} + return _plugins.load_plugin(PLUGIN_NAME, config_root=cfg) + + +def test_badcharacters_generates_all_categories(): + original_cap = _config.run.soft_probe_prompt_cap + _config.run.soft_probe_prompt_cap = None + try: + probe = _load_badcharacters( + { + "perturbation_budget": 1, + "max_position_candidates": 2, + "max_reorder_candidates": 2, + "max_ascii_variants": 4, + "follow_prompt_cap": False, + } + ) + finally: + _config.run.soft_probe_prompt_cap = original_cap + + categories = {conv.notes.get("bad_character_category") for conv in probe.prompts} + assert {"invisible", "homoglyph", "reordering", "deletion"}.issubset(categories) + + for conv in probe.prompts: + assert isinstance(conv.notes.get("operations"), dict) + assert conv.notes.get("source_payload_index") is not None + assert conv.notes.get("source_payload_excerpt") + + +def test_badcharacters_respects_prompt_cap(): + original_cap = _config.run.soft_probe_prompt_cap + _config.run.soft_probe_prompt_cap = 5 + try: + probe = _load_badcharacters( + { + "perturbation_budget": 1, + "max_position_candidates": 2, + "max_reorder_candidates": 2, + "max_ascii_variants": 3, + } + ) + finally: + _config.run.soft_probe_prompt_cap = original_cap + + assert len(probe.prompts) <= 5 + + categories = {conv.notes.get("bad_character_category") for conv in probe.prompts} + assert categories # non-empty after downsampling + + +def test_badcharacters_inserts_expected_characters(): + probe = _load_badcharacters( + { + "perturbation_budget": 1, + "max_position_candidates": 2, + "max_reorder_candidates": 2, + "max_ascii_variants": 2, + "follow_prompt_cap": False, + } + ) + + samples = {} + for conv in probe.prompts: + category = conv.notes.get("bad_character_category") + samples.setdefault(category, conv) + + assert {"invisible", "homoglyph", "reordering", "deletion"}.issubset(samples.keys()) + + invisible_text = samples["invisible"].turns[0].content.text + assert any(ch in DEFAULT_INVISIBLE for ch in invisible_text) + + homoglyph_conv = samples["homoglyph"] + homoglyph_text = homoglyph_conv.turns[0].content.text + replacements = homoglyph_conv.notes["operations"]["replacements"] + for replacement in replacements: + assert replacement in homoglyph_text + + reorder_text = samples["reordering"].turns[0].content.text + assert any(ctrl in reorder_text for ctrl in BIDI_CONTROLS.values()) + + deletion_conv = samples["deletion"] + deletion_text = deletion_conv.turns[0].content.text + ascii_codes = deletion_conv.notes["operations"]["ascii_codes"] + for code in ascii_codes: + seq = f"{chr(code)}\b" + assert seq in deletion_text From 8ddb5054b44a1f314112ab1e0597d5723e24ba75 Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Mon, 17 Nov 2025 11:12:35 -0600 Subject: [PATCH 06/10] probe: bad characters formatting --- garak/probes/badchars.py | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/garak/probes/badchars.py b/garak/probes/badchars.py index 4bc414ccf..f53d0bf8b 100644 --- a/garak/probes/badchars.py +++ b/garak/probes/badchars.py @@ -1,6 +1,3 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 - """Imperceptible perturbation probes inspired by the Bad Characters paper. Implements the "just try everything" strategy for invisible Unicode characters, @@ -98,7 +95,7 @@ def _load_homoglyph_map() -> dict[str, List[str]]: except FileNotFoundError as exc: msg = ( "Unable to load intentional.txt for homoglyph perturbations. " - "Run `git lfs pull` or reinstall data assets." + "Get data from - https://www.unicode.org/Public/security/latest/intentional.txt" ) logging.error(msg) raise PluginConfigurationError(msg) from exc @@ -148,12 +145,12 @@ def __init__(self, config_root=_config): if isinstance(categories, str): categories = [categories] self._enabled_categories = { - cat.lower() - for cat in categories - if isinstance(cat, str) + cat.lower() for cat in categories if isinstance(cat, str) } if not self._enabled_categories: - raise PluginConfigurationError("enabled_categories must include at least one entry") + raise PluginConfigurationError( + "enabled_categories must include at least one entry" + ) payload_group = garak.payloads.load(self.payload_name) self._source_payloads = payload_group.payloads @@ -216,9 +213,7 @@ def _generate_variants( } yield variant_text, metadata - def _generate_invisible_variants( - self, payload: str - ) -> Iterator[Tuple[str, dict]]: + def _generate_invisible_variants(self, payload: str) -> Iterator[Tuple[str, dict]]: positions = self._select_positions(len(payload), self.max_position_candidates) for count in range(1, self.perturbation_budget + 1): if len(positions) < count: @@ -234,9 +229,7 @@ def _generate_invisible_variants( }, } - def _generate_homoglyph_variants( - self, payload: str - ) -> Iterator[Tuple[str, dict]]: + def _generate_homoglyph_variants(self, payload: str) -> Iterator[Tuple[str, dict]]: candidate_positions = [ idx for idx, ch in enumerate(payload) if ch in self._homoglyph_map ] @@ -262,17 +255,13 @@ def _generate_homoglyph_variants( }, } - def _generate_reordering_variants( - self, payload: str - ) -> Iterator[Tuple[str, dict]]: + def _generate_reordering_variants(self, payload: str) -> Iterator[Tuple[str, dict]]: if len(payload) < 2: return candidates = self._select_positions( len(payload) - 1, self.max_reorder_candidates, include_endpoint=False ) - valid_indices = [ - idx for idx in candidates if idx < len(payload) - 1 - ] + valid_indices = [idx for idx in candidates if idx < len(payload) - 1] for count in range(1, min(self.perturbation_budget, len(valid_indices)) + 1): for combo in itertools.combinations(valid_indices, count): if not self._non_overlapping(combo): @@ -283,9 +272,7 @@ def _generate_reordering_variants( "operations": {"positions": list(combo)}, } - def _generate_deletion_variants( - self, payload: str - ) -> Iterator[Tuple[str, dict]]: + def _generate_deletion_variants(self, payload: str) -> Iterator[Tuple[str, dict]]: positions = self._select_positions(len(payload), self.max_position_candidates) ascii_candidates = self._select_ascii(self.max_ascii_variants) for count in range(1, self.perturbation_budget + 1): @@ -305,9 +292,7 @@ def _generate_deletion_variants( }, } - def _inject_sequences( - self, payload: str, insertions: List[Tuple[int, str]] - ) -> str: + def _inject_sequences(self, payload: str, insertions: List[Tuple[int, str]]) -> str: result = payload offset = 0 for position, value in sorted(insertions, key=lambda item: item[0]): From fd94ed374ca8225e7760d1688f934988a8375c61 Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Tue, 18 Nov 2025 10:06:26 -0600 Subject: [PATCH 07/10] fix probe translation test for conversation prompts and expand probe docstring --- garak/probes/badchars.py | 7 ++++++- tests/langservice/probes/test_probes_base.py | 17 +++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/garak/probes/badchars.py b/garak/probes/badchars.py index f53d0bf8b..76766d8f8 100644 --- a/garak/probes/badchars.py +++ b/garak/probes/badchars.py @@ -104,7 +104,12 @@ def _load_homoglyph_map() -> dict[str, List[str]]: class BadCharacters(garak.probes.Probe): - """Imperceptible perturbations mixing invisible chars, homoglyphs, swaps, deletions.""" + """Imperceptible perturbations mixing invisible chars, homoglyphs, swaps, deletions. + + Implements the paper's full spectrum of adversarial perturbations by generating prompt variants that insert invisible Unicode, + swap characters with bidi tricks, substitute homoglyphs, and inject deletions/backspaces. + The goal is to stress refusal policies with subtle transformations that should remain semantically equivalent to the original payloads. + """ lang = "en" active = False diff --git a/tests/langservice/probes/test_probes_base.py b/tests/langservice/probes/test_probes_base.py index 3ea1d88ac..243e82d0e 100644 --- a/tests/langservice/probes/test_probes_base.py +++ b/tests/langservice/probes/test_probes_base.py @@ -7,7 +7,7 @@ import os from garak import _config, _plugins -from garak.attempt import Message, Attempt +from garak.attempt import Message, Attempt, Conversation NON_PROMPT_PROBES = [ @@ -289,7 +289,20 @@ def test_probe_prompt_translation(classname, mocker): probe_instance.probe(generator_instance) - expected_provision_calls = len(probe_instance.prompts) + 1 + prompts = probe_instance.prompts or [] + forward_translation_calls = 0 + if prompts: + if isinstance(prompts[0], str): + forward_translation_calls = 1 + else: + # Conversation prompts trigger a translation per turn, while message prompts translate once per prompt. + for prompt in prompts: + if isinstance(prompt, Conversation): + forward_translation_calls += len(prompt.turns) + elif isinstance(prompt, Message): + forward_translation_calls += 1 + + expected_provision_calls = len(prompts) + forward_translation_calls if hasattr(probe_instance, "triggers"): # increase prompt calls by 1 or if triggers are lists by the len of triggers if isinstance(probe_instance.triggers[0], list): From b9e166dc54633ebff36d2d54ff50b0ac6dad9cfe Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Wed, 26 Nov 2025 11:47:55 -0600 Subject: [PATCH 08/10] document bidi swap expansion ordering --- garak/probes/badchars.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/garak/probes/badchars.py b/garak/probes/badchars.py index 76766d8f8..79cb14e08 100644 --- a/garak/probes/badchars.py +++ b/garak/probes/badchars.py @@ -40,7 +40,13 @@ class _Swap: def _render_swaps(elements: Sequence) -> str: - """Recursively expand swap objects into bidi control sequences.""" + """Recursively expand swap objects into bidi control sequences. + + The sequence mirrors the bidi swap function from Boucher et al. + ("Bad Characters," arXiv:2106.09898) and the imperceptible reference + implementation: it forces two adjacent code points to render in reverse + order while containing all directionality side effects. + """ rendered: List[str] = [] for element in elements: From 07fbd25ac7524101d6e0a228739e83406a5fc1b2 Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Thu, 4 Dec 2025 18:39:14 -0600 Subject: [PATCH 09/10] document downsampling behavior --- garak/probes/badchars.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/garak/probes/badchars.py b/garak/probes/badchars.py index 79cb14e08..c47c078cc 100644 --- a/garak/probes/badchars.py +++ b/garak/probes/badchars.py @@ -378,6 +378,10 @@ def _excerpt(payload: str, limit: int = 96) -> str: return f"{flattened[: limit - 1]}…" def _downsample_prompts(self) -> None: + """Downsample prompts while keeping category balance and seedable shuffling. + + Differs from Probe._prune_data, which randomly truncates without preserving + category coverage.""" if not self.prompts: return cap = self.soft_probe_prompt_cap From a1d6fa88154efaca417a8f4c41adbcfa1e41d539 Mon Sep 17 00:00:00 2001 From: Har1sh-k Date: Thu, 4 Dec 2025 18:55:46 -0600 Subject: [PATCH 10/10] validate BadChar categories upfront --- garak/probes/badchars.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/garak/probes/badchars.py b/garak/probes/badchars.py index c47c078cc..ce4c6f28b 100644 --- a/garak/probes/badchars.py +++ b/garak/probes/badchars.py @@ -163,6 +163,26 @@ def __init__(self, config_root=_config): "enabled_categories must include at least one entry" ) + self._generators = { + "invisible": self._generate_invisible_variants, + "homoglyph": self._generate_homoglyph_variants, + "reordering": self._generate_reordering_variants, + "deletion": self._generate_deletion_variants, + } + supported_categories = set(self._generators) + unknown_categories = self._enabled_categories - supported_categories + if unknown_categories: + logging.warning( + "Unknown BadCharacters categories %s; skipping", + sorted(unknown_categories), + ) + self._enabled_categories &= supported_categories + if not self._enabled_categories: + raise PluginConfigurationError( + "enabled_categories must include at least one recognized entry " + f"{sorted(supported_categories)}" + ) + payload_group = garak.payloads.load(self.payload_name) self._source_payloads = payload_group.payloads self.prompts: List[garak.attempt.Conversation] = [] @@ -203,16 +223,9 @@ def _append_prompt(self, text: str, metadata: dict) -> None: def _generate_variants( self, payload: str, payload_idx: int ) -> Iterator[Tuple[str, dict]]: - generators = { - "invisible": self._generate_invisible_variants, - "homoglyph": self._generate_homoglyph_variants, - "reordering": self._generate_reordering_variants, - "deletion": self._generate_deletion_variants, - } for category in self._enabled_categories: - generator = generators.get(category) + generator = self._generators.get(category) if generator is None: - logging.warning("Unknown BadCharacters category '%s'", category) continue for variant_text, details in generator(payload): metadata = {