Update Conv2d and Conv3d layer implementations

Isotr0py · Isotr0py · commit 2d9f1c7ad8e0 · 2025-11-13T20:11:22.000+08:00
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
diff --git a/vllm/model_executor/layers/multi_modal/conv.py b/vllm/model_executor/layers/multi_modal/conv.py
@@ -2,80 +2,172 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Conv Layer Class."""
 
+import math
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.utils import set_weight_attrs
 
 
-@CustomOp.register("conv")
 class ConvLayerBase(CustomOp):
     """Conv layer base class."""
 
-    def __init__(
-        self,
-    ) -> None:
-        super().__init__()
-
-
-class Conv2dLayer(ConvLayerBase):
-    """Conv layer with Conv2d."""
+    num_dim: int
 
     def __init__(
         self,
         in_channels: int,
         out_channels: int,
-        kernel_size: int | tuple,
-        stride: int | tuple | None,
-        padding: int | tuple | str | None,
-        dilation: int | tuple | None,
-        groups: int | None,
-        bias: bool | None,
-        padding_mode: str | None,
+        kernel_size: int | tuple[int, ...],
+        stride: int | tuple[int, ...] = 1,
+        padding: int | tuple[int, ...] = 0,
+        dilation: int | tuple[int, ...] = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        *,
+        params_dtype: torch.dtype | None = None,
     ) -> None:
         super().__init__()
 
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        kernel_size = (
+            (kernel_size,) * self.num_dim
+            if isinstance(kernel_size, int)
+            else kernel_size
+        )
+        stride = (stride,) * self.num_dim if isinstance(stride, int) else stride
+        padding = (padding,) * self.num_dim if isinstance(padding, int) else padding
+        dilation = (dilation,) * self.num_dim if isinstance(dilation, int) else dilation
+
         self.in_channels = in_channels
         self.out_channels = out_channels
         self.kernel_size = kernel_size
         self.stride = stride
         self.padding = padding
         self.dilation = dilation
         self.groups = groups
-        self.bias = bias
         self.padding_mode = padding_mode
 
-        self.proj = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-            bias=bias,
-            padding_mode=padding_mode,
+        self.can_linearize = (
+            (self.kernel_size == self.stride)
+            and not any(self.padding)
+            and self.groups == 1
         )
 
-    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.proj(x)
-        return x
+        if self.can_linearize:
+            self.weight = nn.Parameter(
+                torch.empty(
+                    out_channels,
+                    in_channels * math.prod(self.kernel_size),
+                    dtype=params_dtype,
+                ),
+            )
+        else:
+            self.weight = nn.Parameter(
+                torch.empty(
+                    out_channels,
+                    in_channels // groups,
+                    *kernel_size,
+                    dtype=params_dtype,
+                ),
+            )
+        set_weight_attrs(self.weight, {"weight_loader": self.weight_loader})
 
-    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        return self.forward_native(x)
+        if bias:
+            self.bias = nn.Parameter(torch.empty(self.out_channels, dtype=params_dtype))
+            set_weight_attrs(
+                self.bias,
+                {
+                    "weight_loader": self.weight_loader,
+                },
+            )
+        else:
+            self.register_parameter("bias", None)
 
     def extra_repr(self) -> str:
         s = f"in_channels={self.in_channels}, "
         s += f"out_channels={self.out_channels}, "
         s += f"kernel_size={self.kernel_size}, "
         s += f"stride={self.stride}, "
         s += f"padding={self.padding}, "
-        s += f"bias={self.bias}, "
+        s += f"bias={self.bias is not None}"
         return s
 
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param.data.copy_(loaded_weight.view(param.shape))
+
+
+@CustomOp.register("conv2d")
+class Conv2dLayer(ConvLayerBase):
+    """Conv layer with Conv2d."""
+
+    num_dim = 2
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        """Expected input shape: (batch_size, in_channels, height, width)"""
+        assert x.dim() == 4
+        if self.can_linearize:
+            B, C, H, W = x.shape
+            K1, K2 = self.kernel_size
+            H, W = H // K1, W // K2
+            x = x.view(-1, self.in_channels * math.prod(self.kernel_size))
+            x = F.linear(x, self.weight, self.bias)
+            x = x.view(B, self.out_channels, H, W)
+        else:
+            x = F.conv2d(
+                x,
+                self.weight,
+                self.bias,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+            )
+        return x
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        return self.forward_native(x)
+
+
+@CustomOp.register("conv3d")
+class Conv3dLayer(ConvLayerBase):
+    """Conv layer with Conv3d."""
+
+    num_dim = 3
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        """Expected input shape: (batch_size, in_channels, time, height, width)"""
+        assert x.dim() == 5
+        if self.can_linearize:
+            B, C, T, H, W = x.shape
+            K1, K2, K3 = self.kernel_size
+            T, H, W = T // K1, H // K2, W // K3
+            x = x.view(-1, self.in_channels * math.prod(self.kernel_size))
+            x = F.linear(x, self.weight, self.bias)
+            x = x.view(B, self.out_channels, T, H, W)
+        else:
+            x = F.conv3d(
+                x,
+                self.weight,
+                self.bias,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+            )
+        return x
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        return self.forward_native(x)
+
 
 class CausalConv2dLayer(Conv2dLayer):
     """
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
@@ -26,7 +26,6 @@
 # limitations under the License.
 """Inference-only Qwen2.5-VL model compatible with HuggingFace weights."""
 
-import math
 from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import lru_cache, partial
 from typing import Annotated, Any, Literal, TypeAlias
@@ -63,7 +62,7 @@
     QKVParallelLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.multi_modal import get_conv_layer
+from vllm.model_executor.layers.multi_modal.conv import Conv3dLayer
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
@@ -110,7 +109,6 @@
     maybe_prefix,
 )
 from .vision import (
-    conv3d_to_linear_weight,
     get_vit_attn_backend,
     run_dp_sharded_mrope_vision_model,
 )
@@ -555,16 +553,18 @@ def __init__(
         self.hidden_size = hidden_size
 
         kernel_size = (temporal_patch_size, patch_size, patch_size)
-        self.proj = get_conv_layer(
-            input_size=in_channels * math.prod(kernel_size),
-            output_size=hidden_size,
+        self.proj = Conv3dLayer(
+            in_channels,
+            hidden_size,
+            kernel_size=kernel_size,
+            stride=kernel_size,
             bias=False,
-            return_bias=False,
-            conv_type="linear",
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.proj(x)
+        L, C = x.shape
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size)
+        x = self.proj(x).view(L, self.hidden_size)
         return x
 
 
@@ -988,9 +988,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loaded_params: set[str] = set()
 
         for name, loaded_weight in weights:
-            if name.endswith("patch_embed.proj.weight"):
-                loaded_weight = conv3d_to_linear_weight(loaded_weight)
-
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue