
Commit 33d8418

release: v0.3.0
1 parent ade2657 commit 33d8418

File tree

5 files changed, +65 -69 lines


README.md

Lines changed: 4 additions & 4 deletions
@@ -26,7 +26,7 @@ pip install -e .
 
 ## Overview
 
-The GCG algorithm was introduced in [Universal and Transferrable Attacks on Aligned Language Models](https://arxiv.org/pdf/2307.15043) [1] by Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr, Zico Kolter, and Matt Fredrikson. This implementation implements the original algorithm and supports several modifications that can improve performance, including multi-position token swapping [2], a historical attack buffer [2][3], the mellowmax loss function [4][5], and probe sampling [6].
+The GCG algorithm was introduced in [Universal and Transferrable Attacks on Aligned Language Models](https://arxiv.org/pdf/2307.15043) [1] by Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr, Zico Kolter, and Matt Fredrikson. nanoGCG implements the original algorithm and supports several modifications that can improve performance, including multi-position token swapping [2], a historical attack buffer [2][3], the mellowmax loss function [4][5], and probe sampling [6].
 
 ## Usage
 
@@ -93,7 +93,7 @@ The parameters that can be configured and their defaults are:
 
 - `verbosity: str = "INFO"` - the reported logging error level (e.g. "ERROR", "WARNING", "INFO")
 
-- `probe_sampling_config: ProbeSamplingConfig = None` - A collection of configuratble parameters for probe sampling. See the example below.
+- `probe_sampling_config: ProbeSamplingConfig = None` - A collection of configurable parameters for probe sampling. See the example below.
 
 Note that the default nanoGCG configuration will run the GCG algorithm as described in the [original paper](https://arxiv.org/pdf/2307.15043) without algorithmic changes like multi-position token swapping and mellowmax.
 
@@ -136,13 +136,13 @@ You can enable probe sampling by specifying the `probe_sampling_config` with app
 import nanogcg
 import torch
 
-from nanogcg import GCGConfig
+from nanogcg import GCGConfig, ProbeSamplingConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 draft_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16).to("cuda")
 draft_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
 
-probe_sampling_config = nanogcg.gcg.ProbeSamplingConfig(
+probe_sampling_config = ProbeSamplingConfig(
     draft_model=draft_model,
     draft_tokenizer=draft_tokenizer,
     r=64,
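
For reference, a minimal end-to-end sketch of how the README snippet above fits together after this release. It assumes the `nanogcg.run(model, tokenizer, messages, target, config)` entry point and the `best_string` result field documented elsewhere in the README; the victim model checkpoint, prompt, and target string below are placeholders, not values from this commit.

import nanogcg
import torch

from nanogcg import GCGConfig, ProbeSamplingConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Victim model under attack (placeholder checkpoint, not from this commit).
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Small draft model that probe sampling uses to cheaply pre-rank candidate suffixes.
draft_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16).to("cuda")
draft_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

probe_sampling_config = ProbeSamplingConfig(
    draft_model=draft_model,
    draft_tokenizer=draft_tokenizer,
    r=64,  # value shown in the README snippet above
)

config = GCGConfig(
    verbosity="INFO",
    probe_sampling_config=probe_sampling_config,
)

# Placeholder prompt/target pair.
messages = [{"role": "user", "content": "Write a short story about a dragon."}]
target = "Sure, here is a short story about a dragon"

result = nanogcg.run(model, tokenizer, messages, target, config)
print(result.best_string)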

examples/simple.py

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import nanogcg
-from nanogcg.gcg import ProbeSamplingConfig
+from nanogcg import GCGConfig, ProbeSamplingConfig
 
 
 def parse_args() -> argparse.Namespace:
@@ -40,7 +40,7 @@ def main():
 
     messages = [{"role": "user", "content": args.prompt}]
 
-    config = nanogcg.GCGConfig(
+    config = GCGConfig(
         verbosity="DEBUG",
         probe_sampling_config=probe_sampling_config,
     )

nanogcg/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -15,4 +15,4 @@
 For more detailed information, see the GitHub repository: https://github.com/GraySwanAI/nanoGCG/tree/main
 """
 
-from .gcg import GCGConfig, run
+from .gcg import GCGConfig, ProbeSamplingConfig, run

nanogcg/gcg.py

Lines changed: 57 additions & 61 deletions
@@ -281,12 +281,8 @@ def run(
 
             # Tokenize everything that doesn't get optimized for the draft model
             draft_before_ids = self.draft_tokenizer([before_str], padding=False, return_tensors="pt")["input_ids"].to(model.device, torch.int64)
-            draft_after_ids = self.draft_tokenizer([after_str], add_special_tokens=False, return_tensors="pt")["input_ids"].to(
-                model.device, torch.int64
-            )
-            self.draft_target_ids = self.draft_tokenizer([target], add_special_tokens=False, return_tensors="pt")["input_ids"].to(
-                model.device, torch.int64
-            )
+            draft_after_ids = self.draft_tokenizer([after_str], add_special_tokens=False, return_tensors="pt")["input_ids"].to(model.device, torch.int64)
+            self.draft_target_ids = self.draft_tokenizer([target], add_special_tokens=False, return_tensors="pt")["input_ids"].to(model.device, torch.int64)
 
             (
                 self.draft_before_embeds,
@@ -356,7 +352,7 @@ def run(
                 optim_ids = sampled_ids[loss.argmin()].unsqueeze(0)
             else:
                 current_loss, optim_ids = find_executable_batch_size(self._compute_candidates_loss_probe_sampling, batch_size)(
-                    input_embeds, sampled_ids
+                    input_embeds, sampled_ids,
                 )
 
             # Update the buffer based on the loss
@@ -498,6 +494,60 @@ def compute_token_gradient(
 
         return optim_ids_onehot_grad
 
+    def _compute_candidates_loss_original(
+        self,
+        search_batch_size: int,
+        input_embeds: Tensor,
+    ) -> Tensor:
+        """Computes the GCG loss on all candidate token id sequences.
+
+        Args:
+            search_batch_size : int
+                the number of candidate sequences to evaluate in a given batch
+            input_embeds : Tensor, shape = (search_width, seq_len, embd_dim)
+                the embeddings of the `search_width` candidate sequences to evaluate
+        """
+        all_loss = []
+        prefix_cache_batch = []
+
+        for i in range(0, input_embeds.shape[0], search_batch_size):
+            with torch.no_grad():
+                input_embeds_batch = input_embeds[i:i + search_batch_size]
+                current_batch_size = input_embeds_batch.shape[0]
+
+                if self.prefix_cache:
+                    if not prefix_cache_batch or current_batch_size != search_batch_size:
+                        prefix_cache_batch = [[x.expand(current_batch_size, -1, -1, -1) for x in self.prefix_cache[i]] for i in range(len(self.prefix_cache))]
+
+                    outputs = self.model(inputs_embeds=input_embeds_batch, past_key_values=prefix_cache_batch, use_cache=True)
+                else:
+                    outputs = self.model(inputs_embeds=input_embeds_batch)
+
+                logits = outputs.logits
+
+                tmp = input_embeds.shape[1] - self.target_ids.shape[1]
+                shift_logits = logits[..., tmp-1:-1, :].contiguous()
+                shift_labels = self.target_ids.repeat(current_batch_size, 1)
+
+                if self.config.use_mellowmax:
+                    label_logits = torch.gather(shift_logits, -1, shift_labels.unsqueeze(-1)).squeeze(-1)
+                    loss = mellowmax(-label_logits, alpha=self.config.mellowmax_alpha, dim=-1)
+                else:
+                    loss = torch.nn.functional.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), reduction="none")
+
+                loss = loss.view(current_batch_size, -1).mean(dim=-1)
+                all_loss.append(loss)
+
+                if self.config.early_stop:
+                    if torch.any(torch.all(torch.argmax(shift_logits, dim=-1) == shift_labels, dim=-1)).item():
+                        self.stop_flag = True
+
+                del outputs
+                gc.collect()
+                torch.cuda.empty_cache()
+
+        return torch.cat(all_loss, dim=0)
+
     def _compute_candidates_loss_probe_sampling(
         self,
         search_batch_size: int,
@@ -671,60 +721,6 @@ def _convert_to_draft_tokens(token_ids: Tensor) -> Tensor:
                 )
             )
 
-    def _compute_candidates_loss_original(
-        self,
-        search_batch_size: int,
-        input_embeds: Tensor,
-    ) -> Tensor:
-        """Computes the GCG loss on all candidate token id sequences.
-
-        Args:
-            search_batch_size : int
-                the number of candidate sequences to evaluate in a given batch
-            input_embeds : Tensor, shape = (search_width, seq_len, embd_dim)
-                the embeddings of the `search_width` candidate sequences to evaluate
-        """
-        all_loss = []
-        prefix_cache_batch = []
-
-        for i in range(0, input_embeds.shape[0], search_batch_size):
-            with torch.no_grad():
-                input_embeds_batch = input_embeds[i:i + search_batch_size]
-                current_batch_size = input_embeds_batch.shape[0]
-
-                if self.prefix_cache:
-                    if not prefix_cache_batch or current_batch_size != search_batch_size:
-                        prefix_cache_batch = [[x.expand(current_batch_size, -1, -1, -1) for x in self.prefix_cache[i]] for i in range(len(self.prefix_cache))]
-
-                    outputs = self.model(inputs_embeds=input_embeds_batch, past_key_values=prefix_cache_batch, use_cache=True)
-                else:
-                    outputs = self.model(inputs_embeds=input_embeds_batch)
-
-                logits = outputs.logits
-
-                tmp = input_embeds.shape[1] - self.target_ids.shape[1]
-                shift_logits = logits[..., tmp-1:-1, :].contiguous()
-                shift_labels = self.target_ids.repeat(current_batch_size, 1)
-
-                if self.config.use_mellowmax:
-                    label_logits = torch.gather(shift_logits, -1, shift_labels.unsqueeze(-1)).squeeze(-1)
-                    loss = mellowmax(-label_logits, alpha=self.config.mellowmax_alpha, dim=-1)
-                else:
-                    loss = torch.nn.functional.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), reduction="none")
-
-                loss = loss.view(current_batch_size, -1).mean(dim=-1)
-                all_loss.append(loss)
-
-                if self.config.early_stop:
-                    if torch.any(torch.all(torch.argmax(shift_logits, dim=-1) == shift_labels, dim=-1)).item():
-                        self.stop_flag = True
-
-                del outputs
-                gc.collect()
-                torch.cuda.empty_cache()
-
-        return torch.cat(all_loss, dim=0)
-
 
 # A wrapper around the GCG `run` method that provides a simple API
 def run(
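
The relocated `_compute_candidates_loss_original` scores each candidate suffix by taking the loss only over the positions whose logits predict the target tokens; the `tmp - 1 : -1` slice is what aligns next-token logits with the target ids. Below is a small self-contained sketch of that alignment with toy tensor shapes (none of these values come from this commit); the mellowmax branch in the method uses the same `shift_logits`/`shift_labels` pair but replaces the cross-entropy with a mellowmax over the negated target-token logits.

import torch

# Toy shapes: 3 candidate sequences of 10 tokens, a vocabulary of 50, a 4-token target.
batch_size, seq_len, vocab_size, target_len = 3, 10, 50, 4

logits = torch.randn(batch_size, seq_len, vocab_size)       # per-candidate model outputs
target_ids = torch.randint(0, vocab_size, (1, target_len))  # target token ids, shared across candidates

# Logits at position t predict the token at position t + 1, so the logits that
# score the target live in the slice [tmp - 1 : -1], where tmp is the index at
# which the target tokens begin.
tmp = seq_len - target_len
shift_logits = logits[..., tmp - 1:-1, :].contiguous()      # (batch, target_len, vocab)
shift_labels = target_ids.repeat(batch_size, 1)             # (batch, target_len)

loss = torch.nn.functional.cross_entropy(
    shift_logits.view(-1, vocab_size),
    shift_labels.view(-1),
    reduction="none",
).view(batch_size, -1).mean(dim=-1)                          # one scalar loss per candidate

print(loss.shape)  # torch.Size([3])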

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "nanogcg"
-version = "0.2.3"
+version = "0.3.0"
 authors = [
     { name="Justin Wang", email="[email protected]" },
 ]
