Commit 2f1581b

Author: George Ohashi (committed)
Merge branch 'fix-test_compress-tensors-utils' of github.com:vllm-project/llm-compressor into fix-test_compress-tensors-utils
2 parents 83eac37 + c0a552a, commit 2f1581b

43 files changed: +367 -330 lines changed

examples/big_models_with_accelerate/mult_gpus_int8_device_map.py

Lines changed: 2 additions & 2 deletions
@@ -7,13 +7,13 @@
 from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
 
-MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407"
+MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
 
 # adjust based off number of desired GPUs
 # reserve_for_hessians=True reserves memory which is required by
 # GPTQModifier and SparseGPTModifier
 device_map = calculate_offload_device_map(
-    MODEL_ID, num_gpus=2, reserve_for_hessians=True, torch_dtype=torch.bfloat16
+    MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16
 )
 
 model = AutoModelForCausalLM.from_pretrained(
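Note: the hunk ends mid-call, so the arguments to from_pretrained are not shown here. A self-contained sketch of how this example plausibly continues (the from_pretrained keyword arguments are an assumption, not part of this diff):

import torch
from transformers import AutoModelForCausalLM

from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# reserve_for_hessians=True sets aside memory needed by GPTQModifier / SparseGPTModifier
device_map = calculate_offload_device_map(
    MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16
)

# assumed continuation: hand the computed map straight to from_pretrained
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16
)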

src/llmcompressor/modifiers/obcq/utils/helpers.py

Lines changed: 0 additions & 202 deletions
This file was deleted.

src/llmcompressor/modifiers/smoothquant/base.py

Lines changed: 17 additions & 0 deletions
@@ -2,6 +2,7 @@
 from typing import Callable, Dict, List, Optional, Tuple
 
 import torch
+from compressed_tensors.utils.offload import is_module_offloaded
 from loguru import logger
 from torch.nn import Module
 
@@ -282,6 +283,10 @@ def _apply_smoothing(self, model: Module):
 
             @torch.no_grad()
             def smooth(module):
+                offloaded = is_module_offloaded(module)
+                if offloaded:
+                    module._hf_hook.pre_forward(module)
+
                 if module in balance_layers:
                     module.weight.mul_(scales.view(1, -1))
                 elif module == smooth_layer:
@@ -292,6 +297,9 @@ def smooth(module):
                 if hasattr(module, "bias") and module.bias is not None:
                     module.bias.div_(scales)
 
+                if offloaded:
+                    module._hf_hook.post_forward(module, None)
+
             parent = get_fsdp_parent(mapping.smooth_name, model)
             if parent is not None:
                 parent.apply(smooth)
@@ -318,8 +326,16 @@ def _calculate_smoothing_scales(
         # get the channel-wise dynamic range for each layer to be balanced
         weight_scales = []
         for layer in balance_layers:
+            offloaded = is_module_offloaded(layer)
+            if offloaded:
+                layer._hf_hook.pre_forward(layer)
+
             scale = layer.weight.abs().max(dim=0, keepdim=True)[0]
             weight_scales.append(scale)
+
+            if offloaded:
+                layer._hf_hook.post_forward(layer, None)
+
         weight_scales = 2.0 * torch.cat(weight_scales, dim=0).max(dim=0)[0]
 
         # calculate the amount of smoothing to apply
@@ -329,4 +345,5 @@ def _calculate_smoothing_scales(
             1 - self.smoothing_strength
         )
         scales = torch.where(weight_scales > 0.0, scales, activation_scales)
+
         return scales
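Both hunks bracket the weight access with the same onload/offload dance: check is_module_offloaded, call _hf_hook.pre_forward before touching module.weight, and call _hf_hook.post_forward afterwards. A rough sketch of how that pattern could be factored into a context manager (illustrative only, not part of this commit; it assumes accelerate-style _hf_hook attributes on offloaded modules):

from contextlib import contextmanager

import torch
from compressed_tensors.utils.offload import is_module_offloaded


@contextmanager
def onloaded(module: torch.nn.Module):
    """Temporarily bring an offloaded module's weights on-device, then restore them."""
    offloaded = is_module_offloaded(module)
    if offloaded:
        module._hf_hook.pre_forward(module)  # move weights to the execution device
    try:
        yield module
    finally:
        if offloaded:
            module._hf_hook.post_forward(module, None)  # return weights to the offload device


# hypothetical usage mirroring _calculate_smoothing_scales above:
# with onloaded(layer):
#     scale = layer.weight.abs().max(dim=0, keepdim=True)[0]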

src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 7 additions & 5 deletions
@@ -9,6 +9,7 @@
 
 from llmcompressor.core import active_session, create_session, pre_initialize_structure
 from llmcompressor.pytorch.utils import ModuleSparsificationInfo
+from llmcompressor.typing import Processor
 
 COMPLETED_STAGES_FILENAME = "completed_stages.json"
 
@@ -92,15 +93,16 @@ def initialize_recipe(model: Module, recipe_path: str):
 def save_model_and_recipe(
     model: Module,
     save_path: str,
-    tokenizer: Optional[Any] = None,
+    processor: Optional[Processor] = None,
     save_safetensors: bool = False,
     save_compressed: bool = False,
 ):
     """
-    Save a model, tokenizer and the currently loaded recipe to file
+    Save a model, processor and the currently loaded recipe to file
+
     :param model: pytorch model to save
     :param save_path: path to save output to
-    :param tokenizer: model tokenizer to save
+    :param processor: model processor or tokenizer to save
     :param save_safetensors: whether to save as safetensors or pickle (bin)
     :param save_compressed: whether to compress sparse weights on disk
     """
@@ -111,8 +113,8 @@ def save_model_and_recipe(
         save_path, save_compressed=save_compressed, safe_serialization=save_safetensors
     )
 
-    if tokenizer is not None:
-        tokenizer.save_pretrained(save_path)
+    if processor is not None:
+        processor.save_pretrained(save_path)
 
     logger.info("Saving output to {}".format(os.path.abspath(save_path)))
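A hedged usage sketch of the renamed parameter; per the updated docstring, either a processor or a plain tokenizer can be passed (the model variable and output path below are assumed placeholders):

from transformers import AutoTokenizer

from llmcompressor.pytorch.model_load.helpers import save_model_and_recipe

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B-Instruct")

save_model_and_recipe(
    model=model,                      # assumed: model produced earlier in the pipeline
    save_path="./compressed-output",  # assumed output directory
    processor=tokenizer,              # a tokenizer satisfies the Processor parameter
    save_safetensors=True,
    save_compressed=True,
)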

src/llmcompressor/transformers/finetune/data/base.py

Lines changed: 31 additions & 13 deletions
@@ -3,14 +3,14 @@
 from compressed_tensors.registry import RegistryMixin
 from datasets import Dataset, IterableDataset
 from loguru import logger
-from transformers import AutoTokenizer
 
 from llmcompressor.transformers.finetune.data.data_args import DataTrainingArguments
 from llmcompressor.transformers.finetune.data.data_helpers import (
     LABELS_MASK_VALUE,
     get_custom_datasets_from_path,
     get_raw_dataset,
 )
+from llmcompressor.typing import Processor
 
 
 class TextGenerationDataset(RegistryMixin):
@@ -30,10 +30,10 @@ def __init__(
         text_column: str,
         data_args: DataTrainingArguments,
         split: str,
-        tokenizer: AutoTokenizer,
+        processor: Processor,
     ):
         self.text_column = text_column
-        self.tokenizer = tokenizer
+        self.processor = processor
         self.data_args = data_args
         self.raw_kwargs = data_args.raw_kwargs or {}
         self.split = split
@@ -50,20 +50,38 @@ def __init__(
         else:
             self.padding = False
 
-        if self.tokenizer:
+        # get tokenizer
+        self.tokenizer = getattr(self.processor, "tokenizer", self.processor)
+
+        if self.tokenizer is not None:
+            # fill in pad token
             if not self.tokenizer.pad_token:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
-        # configure sequence length
-        max_seq_length = data_args.max_seq_length
-        model_max_length = tokenizer.model_max_length if tokenizer else max_seq_length
-        if self.tokenizer and max_seq_length > model_max_length:
-            logger.warning(
-                f"The max_seq_length passed ({max_seq_length}) is larger than "
-                f"the maximum length for the model ({tokenizer.model_max_length}). "
-                f"Using max_seq_length={tokenizer.model_max_length}."
+            # configure sequence length
+            max_seq_length = data_args.max_seq_length
+            if data_args.max_seq_length > self.tokenizer.model_max_length:
+                logger.warning(
+                    f"The max_seq_length passed ({max_seq_length}) is larger than "
+                    f"maximum length for model ({self.tokenizer.model_max_length}). "
+                    f"Using max_seq_length={self.tokenizer.model_max_length}."
+                )
+            self.max_seq_length = min(
+                data_args.max_seq_length, self.tokenizer.model_max_length
+            )
+
+            # configure padding
+            self.padding = (
+                False
+                if self.data_args.concatenate_data
+                else "max_length"
+                if self.data_args.pad_to_max_length
+                else False
             )
-        self.max_seq_length = min(data_args.max_seq_length, model_max_length)
+
+        else:
+            self.max_seq_length = None
+            self.padding = False
 
     def get_raw_dataset(self, cache_dir: Optional[str] = None) -> Dataset:
         """
