prune LM Head for USD #36695 (Merged)
Commits (33, all by jmamou):
a1b8d41 initial commit
da0a7dd Merge branch 'main' into prune_LMHead
6786356 fix
e5611c4 Merge branch 'main' into prune_LMHead
88ee824 fix style
07ead6e set default to prune
12ea728 Merge branch 'main' into prune_LMHead
b06d698 add tests
c310795 Merge branch 'main' into prune_LMHead
3ca6af0 Merge branch 'main' into prune_LMHead
763d284 comment
c28144e Merge branch 'main' into prune_LMHead
6b1e514 Merge branch 'main' into prune_LMHead
f20812f Merge branch 'main' into prune_LMHead
34c2d00 remove prune flag from generate
5957c7c Merge branch 'main' into prune_LMHead
ffdff86 Merge branch 'main' into prune_LMHead
7767d72 Merge branch 'main' into prune_LMHead
f57ae18 address Joao's comments
515911f Merge branch 'main' into prune_LMHead
ed10e3c deprecate_kwarg
98965b2 add doc
2883380 Merge branch 'main' into prune_LMHead
77a4b0b fix target_vocab_size
df3bd2d Merge branch 'main' into prune_LMHead
e5055c1 Merge branch 'main' into prune_LMHead
acf7c88 Update src/transformers/generation/candidate_generator.py
ac833ce Update src/transformers/generation/candidate_generator.py
9d2a1e3 Update src/transformers/generation/candidate_generator.py
d343966 Update src/transformers/generation/candidate_generator.py
c9e844f fix deprecated argument assistant_model_device
773661b Merge branch 'main' into prune_LMHead
1ae859e Merge branch 'main' into prune_LMHead
Changes to src/transformers/generation/candidate_generator.py. The first hunk adds two imports used by the new classes below:

```diff
@@ -19,7 +19,9 @@
 import numpy as np
 import torch
+import torch.nn as nn
 
+from ..pytorch_utils import prune_linear_layer
 from ..utils import is_sklearn_available
```
The hunk `@@ -612,6 +614,63 @@`, following `_process_assistant_outputs` (whose closing `return new_target_ids` appears as context), adds two helper classes. The first wraps a pruned LM head:

```python
class PruneReindexingLMHead(nn.Module):
    """
    A class to prune and reindex the language model head.

    This class prunes the language model head to only include the specified token IDs and reindexes the
    logits to map back to the original vocabulary.

    Args:
        original_lm_head (nn.Module): The original language model head.
        assistant_overlap_token_ids (torch.LongTensor): The token IDs to keep.
    """

    def __init__(self, original_lm_head, assistant_overlap_token_ids):
        super().__init__()
        self.pruned_lm_head = prune_linear_layer(original_lm_head, assistant_overlap_token_ids).to(
            original_lm_head.weight.dtype
        )

    def forward(self, hidden_states):
        pruned_logits = self.pruned_lm_head(hidden_states)
        return pruned_logits
```
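To see what the pruning buys, here is a minimal, self-contained sketch (not from the PR) of `prune_linear_layer` applied to a toy LM head; the sizes and token IDs are invented for illustration:

```python
import torch
import torch.nn as nn

from transformers.pytorch_utils import prune_linear_layer

# Toy LM head: hidden size 4, "vocabulary" of 10 tokens.
lm_head = nn.Linear(4, 10, bias=False)

# Pretend only these assistant tokens overlap with the target vocabulary.
overlap_ids = torch.tensor([1, 3, 7], dtype=torch.long)

# prune_linear_layer keeps only the weight rows at the given indices (dim=0),
# so the pruned head scores 3 tokens per step instead of 10.
pruned_head = prune_linear_layer(lm_head, overlap_ids)

hidden = torch.randn(1, 1, 4)
print(lm_head(hidden).shape)      # torch.Size([1, 1, 10])
print(pruned_head(hidden).shape)  # torch.Size([1, 1, 3])
```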
The second wrapper remaps input token IDs before the embedding lookup:

```python
class MapInputEmbedding(nn.Module):
    def __init__(self, original_embedding: nn.Embedding, assistant_overlap_token_ids):
        """
        Wraps an existing embedding layer and remaps token IDs before lookup.

        Args:
            original_embedding (nn.Embedding): Pre-trained or existing embedding layer.
            assistant_overlap_token_ids (torch.LongTensor): Lookup tensor mapping original token IDs
                to new token IDs (the value at position old_id is new_id).
        """
        super().__init__()
        self.original_embedding = original_embedding
        self.weight = original_embedding.weight
        self.assistant_overlap_token_ids = assistant_overlap_token_ids
        self.map = False

    def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
        """
        Args:
            input_ids (torch.LongTensor): Tensor of token IDs (batch_size, seq_len).

        Returns:
            torch.FloatTensor: Corresponding input embeddings.
        """
        if self.map:
            # Remap only the last token ID; the first call embeds the prompt as-is.
            my_input_ids = self.assistant_overlap_token_ids[input_ids[0, -1]].unsqueeze(0).unsqueeze(0)
        else:
            self.map = True
            my_input_ids = input_ids

        return self.original_embedding(my_input_ids)
```
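A hedged sketch of the wrapper's two-phase behavior, using the `MapInputEmbedding` class defined above; the toy embedding and identity mapping tensor are invented for illustration:

```python
import torch
import torch.nn as nn

# Toy assistant embedding: 10 tokens, hidden size 4.
embedding = nn.Embedding(10, 4)

# Invented lookup tensor: position = original token ID, value = remapped ID.
# In the PR this is derived from the overlap of the two vocabularies.
overlap_map = torch.arange(10, dtype=torch.long)

wrapper = MapInputEmbedding(embedding, overlap_map)

prompt = torch.tensor([[2, 5, 7]])
print(wrapper(prompt).shape)      # first call, no remap: torch.Size([1, 3, 4])

next_token = torch.tensor([[3]])
print(wrapper(next_token).shape)  # later calls remap the last ID: torch.Size([1, 1, 4])
```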
The diff then reaches the existing `AssistantToTargetTranslator` class, whose docstring begins: "Translates token ids and logits between assistant and target model vocabularies. This class is used to handle ...". The next hunk reworks its constructor:
```diff
@@ -638,23 +697,46 @@ def __init__(
         self,
         target_tokenizer: "PreTrainedTokenizerBase",
         assistant_tokenizer: "PreTrainedTokenizerBase",
-        target_vocab_size: int,  # required since target_vocab_size can be different from the length of target_tokenizer.get_vocab()
-        assistant_model_device: str = "cpu",
+        assistant_model: "PreTrainedModel",
+        target_vocab_size: Optional[int],
+        assistant_prune_LM_head: Optional[bool] = True,
     ):
         self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer
         self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer
-        self._assistant_model_device: str = assistant_model_device
+        self._assistant_model_device: str = assistant_model.device
         self.target_vocab_size: int = target_vocab_size
         self._assistant_to_target_input_ids, self.target_to_assistant_input_ids = (
             self._get_assistant_to_target_input_ids()
         )
         self._suppress_input_ids: list[int] = self._get_suppress_input_ids()
         self.logits_processors: Optional[LogitsProcessorList] = None
+        self.assistant_prune_LM_head = assistant_prune_LM_head
```
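Commits ed10e3c (`deprecate_kwarg`) and c9e844f suggest the removed `assistant_model_device` argument is deprecated rather than dropped outright. A minimal sketch, assuming the standard `transformers` `deprecate_kwarg` decorator is used; the class name and version string below are illustrative, not the PR's exact code:

```python
from transformers.utils.deprecation import deprecate_kwarg


class TranslatorSketch:
    # Hypothetical stand-in for the translator's __init__: passing the old
    # assistant_model_device keyword still works but emits a deprecation warning.
    @deprecate_kwarg("assistant_model_device", version="4.53")
    def __init__(self, assistant_model, assistant_model_device=None):
        # The device is now read from the model itself.
        self._assistant_model_device = assistant_model.device
```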
Review comments:

On `self.assistant_prune_LM_head = assistant_prune_LM_head`, a reviewer suggested snake case:

```suggestion
        self.assistant_prune_lm_head = assistant_prune_lm_head
```

A contributor also noted: "Missing docs to public method: it should contain what it does and why it is needed."
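For context, this translator sits on the universal assisted decoding path, which `generate` takes when the assistant's tokenizer differs from the target's. A hedged end-to-end sketch; the checkpoints are placeholders, chosen only because they have mismatched vocabularies:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

target = AutoModelForCausalLM.from_pretrained("gpt2")
target_tok = AutoTokenizer.from_pretrained("gpt2")
assistant = AutoModelForCausalLM.from_pretrained("double7/vicuna-68m")
assistant_tok = AutoTokenizer.from_pretrained("double7/vicuna-68m")

inputs = target_tok("The capital of France is", return_tensors="pt")

# Passing both tokenizers triggers universal assisted decoding, which routes
# through AssistantToTargetTranslator (and, per this PR, a pruned assistant
# LM head by default).
out = target.generate(
    **inputs,
    assistant_model=assistant,
    tokenizer=target_tok,
    assistant_tokenizer=assistant_tok,
    max_new_tokens=20,
)
print(target_tok.decode(out[0], skip_special_tokens=True))
```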