|
16 | 16 |
|
17 | 17 | import math |
18 | 18 | from dataclasses import dataclass |
| 19 | +from functools import lru_cache |
19 | 20 | from typing import Optional, Union |
20 | 21 |
|
21 | 22 | import torch |
22 | 23 | import torch.nn as nn |
23 | 24 | import torch.nn.functional as F |
24 | 25 |
|
25 | 26 | from ... import initialization as init |
| 27 | +from ...audio_utils import conv1d_output_length |
26 | 28 | from ...modeling_utils import PreTrainedAudioTokenizerBase |
27 | 29 | from ...utils import ModelOutput, auto_docstring |
28 | 30 | from ..auto import AutoModel |
@@ -396,6 +398,40 @@ def remove_weight_norm(self): |
396 | 398 | if hasattr(m, "parametrizations") and "weight" in m.parametrizations: |
397 | 399 | torch.nn.utils.parametrize.remove_parametrizations(m, "weight", leave_parametrized=True) |
398 | 400 |
|
| 401 | + @lru_cache |
| 402 | + def _get_conv1d_layers(self, module): |
| 403 | + """ |
| 404 | + Recursively collect all Conv1d layers of a module.
| 405 | + """ |
| 406 | + |
| 407 | + def get_conv1d_layers_recursive(module: nn.Module): |
| 408 | + params_list = [] |
| 409 | + |
| 410 | + if isinstance(module, nn.Conv1d): |
| 411 | + params_list.append(module) |
| 412 | + |
| 413 | + # Recursively check all child modules |
| 414 | + for child in module.children(): |
| 415 | + params_list.extend(get_conv1d_layers_recursive(child)) |
| 416 | + |
| 417 | + return params_list |
| 418 | + |
| 419 | + return tuple(get_conv1d_layers_recursive(module)) |
| 420 | + |
| 421 | + def _get_conv1d_output_lengths(self, input_length, module=None): |
| 422 | + """ |
| 423 | + For a given module, compute the output length that would be obtained after all Conv1d layers. |
| 424 | + """ |
| 425 | + if module is None: |
| 426 | + module = self |
| 427 | + |
| 428 | + conv1d_layers = self._get_conv1d_layers(module) |
| 429 | + |
| 430 | + for layer in conv1d_layers: |
| 431 | + input_length = conv1d_output_length(layer, input_length) |
| 432 | + |
| 433 | + return input_length |
| 434 | + |
399 | 435 |
|
400 | 436 | @auto_docstring(custom_intro="""The Xcodec neural audio codec model.""") |
401 | 437 | class XcodecModel(XcodecPreTrainedModel): |
@@ -476,11 +512,13 @@ def encode( |
476 | 512 |
|
477 | 513 | e_semantic_input = self._extract_semantic_features(input_values).detach() |
478 | 514 | e_semantic = self.encoder_semantic(e_semantic_input.transpose(1, 2)) |
479 | | - e_acoustic = self.acoustic_encoder(input_values) |
480 | 515 |
|
481 | | - if e_acoustic.shape[2] != e_semantic.shape[2]: |
482 | | - # make sure they line up if frames don't match |
483 | | - e_acoustic = self.acoustic_encoder(F.pad(input_values[:, 0, :], (self.pad, self.pad)).unsqueeze(1)) |
| 516 | + # the original codebase runs an extra forward pass to get the output length, but we can
| 517 | + # infer it directly from the model and know whether we should pad
| 518 | + if self._get_conv1d_output_lengths(input_values.shape[2], self.acoustic_encoder) != e_semantic.shape[2]: |
| 519 | + e_acoustic = self.acoustic_encoder(F.pad(input_values, (self.pad, self.pad))) |
| 520 | + else: |
| 521 | + e_acoustic = self.acoustic_encoder(input_values) |
484 | 522 |
|
485 | 523 | embeddings = torch.cat([e_acoustic, e_semantic], dim=1) |
486 | 524 | embeddings = self.fc(embeddings.transpose(1, 2)).transpose(1, 2) |
|
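For reference, here is a minimal sketch of the length arithmetic the new helpers rely on, assuming `conv1d_output_length` applies the standard PyTorch Conv1d output-length formula; the encoder below is illustrative only, not part of this PR:

```python
import torch.nn as nn


def conv1d_output_length_sketch(layer: nn.Conv1d, input_length: int) -> int:
    """Standard PyTorch Conv1d output-length formula (assumed behavior of `conv1d_output_length`)."""
    padding, dilation = layer.padding[0], layer.dilation[0]
    kernel_size, stride = layer.kernel_size[0], layer.stride[0]
    return (input_length + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1


# Compose the formula over every Conv1d in a module, in traversal order,
# mirroring what `_get_conv1d_output_lengths` does for the acoustic encoder.
encoder = nn.Sequential(  # hypothetical stand-in for the acoustic encoder
    nn.Conv1d(1, 16, kernel_size=7, stride=2, padding=3),
    nn.ReLU(),
    nn.Conv1d(16, 32, kernel_size=7, stride=2, padding=3),
)

length = 16000
for m in encoder.modules():
    if isinstance(m, nn.Conv1d):
        length = conv1d_output_length_sketch(m, length)

print(length)  # analytic frame count; no forward pass needed to decide whether to pad
```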