
Commit 2cf3ab3

geetu040 authored and kmehant committed
🚨🚨🚨 Fix sdpa in SAM and refactor relative position embeddings (huggingface#36422)
* fall back to eager if output_attentions
* improve relative position embeddings
* run modular on got_ocr2
* run-slow: sam
* fix run-length encoding
* fix tf processor errors
* update tf_sam
* fix compile error
* re-run tests

Signed-off-by: Mehant Kammakomati <[email protected]>
1 parent 9e94801 commit 2cf3ab3

File tree: 3 files changed, +90 -4 lines changed


src/transformers/integrations/tensor_parallel.py

Lines changed: 76 additions & 0 deletions
@@ -417,6 +417,76 @@ def partition_tensor(self, param, empty_param, param_type, param_casting_dtype,
         parameter = DTensor.from_local(parameter, device_mesh, [Shard(-1)], run_check=False)
         return nn.Parameter(parameter)
 
+class ReplicateParallel(TensorParallelLayer):
+    """
+    Replicate a nn.Module.
+    Users can compose it together with other parallel styles like RowwiseParallel to achieve a fully distributed model.
+    Fully distributed model is needed for gradient clipping.
+
+    Keyword Args:
+        input_layouts (Placement, optional):
+            The DTensor layout of input tensor for the nn.Module, this is used to annotate the input tensor to
+            become a DTensor. If not specified, we assume the input tensor to be replicated.
+        output_layouts (Placement, optional):
+            The DTensor layout of the output for the nn.Module, this is used to ensure the output of the nn.Module
+            with the user desired layout. If not specified, we assume the output tensor to be replicated.
+        use_local_output (bool, optional):
+            Whether to use local :class:`torch.Tensor` instead of :class:`DTensor` for the module output, default: True.
+    Returns:
+        A :class:`ParallelStyle` object that represents replication of nn.Module.
+
+    Example::
+        >>> # xdoctest: +SKIP(failing)
+        >>> from torch.distributed.tensor.parallel import parallelize_module, ReplicateParallel
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> ...
+        >>> m = Model(...)  # m is a nn.Module that contains a "w1" nn.Linear submodule
+        >>> tp_mesh = init_device_mesh("cuda", (8,))
+        >>>
+        >>> # By default, the input and output of the "w1" Linear will be converted to Replicated DTensor
+        >>>
+        >>> replicated_mod = parallelize_module(m, tp_mesh, {"w1": ReplicateParallel()})
+        >>> ...
+    """
+
+    def __init__(
+        self,
+        *,
+        input_layouts: Optional[Placement] = None,
+        output_layouts: Optional[Placement] = None,
+        use_local_output: bool = True,
+        use_dtensor=True,
+    ):
+        super().__init__()
+        self.input_layouts = (input_layouts or Replicate(),)
+        self.output_layouts = (output_layouts or Replicate(),)
+        self.desired_input_layouts = (Replicate(),)
+        self.use_local_output = use_local_output
+        self.use_dtensor = use_dtensor
+
+    @staticmethod
+    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
+        # since nn.Linear and nn.Embedding have single input
+        # we may extend support to other modules since its replicate.
+        input_tensor = inputs[0]
+        if isinstance(input_tensor, torch.distributed._functional_collectives.AsyncCollectiveTensor):
+            input_tensor = input_tensor.trigger_wait()
+        if not isinstance(input_tensor, DTensor):
+            input_tensor = DTensor.from_local(input_tensor, device_mesh, input_layouts, run_check=False)
+
+        if input_layouts != desired_input_layouts:
+            input_tensor = input_tensor.redistribute(placements=desired_input_layouts, async_op=True)
+        return input_tensor
+
+    @staticmethod
+    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+        if outputs.placements != output_layouts:
+            outputs = outputs.redistribute(placements=output_layouts, async_op=True)
+        return outputs.to_local() if use_local_output else outputs
 
 SUPPORTED_TP_STYLES = {
     "colwise",

@@ -428,6 +498,8 @@ def partition_tensor(self, param, empty_param, param_type, param_casting_dtype,
     "local",
     "gather",
     "local_packed_rowwise",
+    "replicate",
+    "replicate_output_dtensor"
 }

@@ -459,6 +531,10 @@ def translate_to_torch_parallel_style(style: str):
         return GatherParallel()
     elif style == "local_packed_rowwise":
         return PackedRowwiseParallel(use_dtensor=False)
+    elif style == "replicate":
+        return ReplicateParallel()
+    elif style == "replicate_output_dtensor":
+        return ReplicateParallel(use_local_output=False)
     else:
         raise ValueError(f"Unsupported parallel style value: {style}")
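For context on the two new style strings: "replicate" resolves to ReplicateParallel() with the default use_local_output=True, so module outputs come back as plain torch.Tensor, while "replicate_output_dtensor" keeps the outputs as DTensors. A minimal sketch of resolving the styles, assuming this commit's tree is importable as transformers.integrations.tensor_parallel:

from transformers.integrations.tensor_parallel import (
    ReplicateParallel,
    translate_to_torch_parallel_style,
)

# "replicate" -> ReplicateParallel(); outputs are converted back to local torch.Tensor
replicate = translate_to_torch_parallel_style("replicate")
assert isinstance(replicate, ReplicateParallel) and replicate.use_local_output

# "replicate_output_dtensor" -> ReplicateParallel(use_local_output=False); outputs stay DTensors
replicate_dtensor = translate_to_torch_parallel_style("replicate_output_dtensor")
assert not replicate_dtensor.use_local_output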

src/transformers/models/granite/configuration_granite.py

Lines changed: 6 additions & 2 deletions
@@ -117,10 +117,14 @@ class GraniteConfig(PretrainedConfig):
         "layers.*.self_attn.q_proj": "colwise",
         "layers.*.self_attn.k_proj": "colwise",
         "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.self_attn.o_proj": "rowwise_output_dtensor",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
+        "layers.*.mlp.down_proj": "rowwise_output_dtensor",
+        "embed_tokens": "replicate_output_dtensor",
+        "layers.*.post_attention_layernorm": "replicate_output_dtensor",
+        "layers.*.input_layernorm": "replicate_output_dtensor",
+        "norm": "replicate_output_dtensor",
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),

src/transformers/trainer.py

Lines changed: 8 additions & 2 deletions
@@ -235,6 +235,7 @@
         AutocastKwargs,
         DistributedDataParallelKwargs,
         DistributedType,
+        TorchTensorParallelPlugin,
         load_fsdp_model,
         load_fsdp_optimizer,
         save_fsdp_model,

@@ -2317,7 +2318,9 @@ def _inner_training_loop(
             else:
                 debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
 
-        delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
+        delay_optimizer_creation = (
+            is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled or self.is_tp_enabled
+        )
 
         # We need to reset the scheduler, as its parameters may be different on subsequent calls
         if self._created_lr_scheduler:

@@ -2372,7 +2375,10 @@ def _inner_training_loop(
                 if self.use_apex:
                     model = self.accelerator.prepare(self.model)
                 else:
-                    model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
+                    if delay_optimizer_creation:
+                        self.optimizer = self.accelerator.prepare(self.optimizer)
+                    else:
+                        model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
             else:
                 # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
                 model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
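The trainer change treats tensor parallelism like the existing SageMaker MP and FSDP cases: optimizer creation is delayed, and when it is delayed only the optimizer is passed through accelerator.prepare here. A standalone sketch of that control flow, not Trainer code; tp_enabled stands in for self.is_tp_enabled and the model/optimizer are toy placeholders:

import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(16, 16)

tp_enabled = True  # stand-in for self.is_tp_enabled
delay_optimizer_creation = tp_enabled  # mirrors: sagemaker_mp or fsdp_xla or fsdp or tp

if delay_optimizer_creation:
    # Prepare the model first, then build the optimizer on the prepared parameters
    # and pass only the optimizer through prepare(), as in the diff above.
    model = accelerator.prepare(model)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    optimizer = accelerator.prepare(optimizer)
else:
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    model, optimizer = accelerator.prepare(model, optimizer)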
