From 2986dc21201fe1a687badd62d2be667d6b335ffe Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 3 Nov 2024 10:48:55 +0500 Subject: [PATCH 001/151] implement config and model building blocks --- .../depth_pro/configuration_depth_pro.py | 167 ++ .../models/depth_pro/modeling_depth_pro.py | 1404 +++++++++++++++++ 2 files changed, 1571 insertions(+) create mode 100644 src/transformers/models/depth_pro/configuration_depth_pro.py create mode 100644 src/transformers/models/depth_pro/modeling_depth_pro.py diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py new file mode 100644 index 000000000000..ad0f1016f7a1 --- /dev/null +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DepthPro model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig +from transformers.utils import logging +from transformers.utils.backbone_utils import get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class DepthProConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DepthProModel`]. It is used to instantiate a + DepthPro model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DepthPro + [apple/DepthPro](https://huggingface.co/apple/DepthPro) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 384):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+            Initial value to use for layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        out_features (`List[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`.
+        decoder_hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the decoder (feature fusion) layers.
+        patch_encoder_hook_ids (`List[int]`, *optional*, defaults to `[5, 11]`):
+            Indices of the patch-encoder layers whose hidden states are used as intermediate features.
+        patch_encoder_feature_dims (`List[int]`, *optional*, defaults to `[256, 512, 1024, 1024]`):
+            Channel dimensions of the upsampled encoder features that are passed to the decoder.
+        use_batch_norm_in_decoder (`bool`, *optional*, defaults to `False`):
+            Whether to use batch normalization in the residual layers of the decoder.
+        use_fov (`bool`, *optional*, defaults to `False`):
+            Whether to add the field-of-view (FOV) estimation head.
+ + Example: + + ```python + >>> from transformers import DepthProConfig, DepthProModel + + >>> # Initializing a DepthPro apple/DepthPro style configuration + >>> configuration = DepthProConfig() + + >>> # Initializing a model (with random weights) from the apple/DepthPro style configuration + >>> model = DepthProModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "depth_pro" + + def __init__( + self, + hidden_size=1024, # changed + decoder_hidden_size=256, + num_hidden_layers=24, # changed + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=384, + patch_size=16, # changed + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + patch_encoder_hook_ids = [5, 11], + # patch_encoder_hook_ids = [5, 11, 17, 23], + patch_encoder_feature_dims = [256, 512, 1024, 1024], + use_batch_norm_in_decoder=False, + use_fov=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.decoder_hidden_size = decoder_hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.patch_encoder_hook_ids = patch_encoder_hook_ids + self.patch_encoder_feature_dims = patch_encoder_feature_dims + self.use_batch_norm_in_decoder = use_batch_norm_in_decoder + self.use_fov = use_fov diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py new file mode 100644 index 000000000000..f73b74060f57 --- /dev/null +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -0,0 +1,1404 @@ +# coding=utf-8 +# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch DepthPro model.""" + +from icecream import ic + +import collections.abc +import math +from typing import Dict, List, Optional, Set, Tuple, Union + +import torch +from torch import nn +from dataclasses import dataclass + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, +) +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from .configuration_depth_pro import DepthProConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT +class DepthProViTPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.dinov2.modeling_dinov2.DepthProViTEmbeddings +# with DepthProViT->DepthProViT and antialias=True in interpolation +class DepthProViTEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. + """ + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.patch_embeddings = DepthProViTPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + target_dtype = patch_pos_embed.dtype + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(torch.float32), + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProPatchEmbeddings + ).to(dtype=target_dtype) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthProViT +class DepthProViTSelfAttention(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SelfAttention with Dinov2->DepthProViT +class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): + def __init__(self, config: DepthProConfig) -> None: + super().__init__(config) + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + ) + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + head_mask, + self.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=None, + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, None + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DepthProViT +class DepthProViTSelfOutput(nn.Module): + """ + The residual connection is defined in DepthProViTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->DepthProViT +class DepthProViTAttention(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.attention = DepthProViTSelfAttention(config) + self.output = DepthProViTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->DepthProViT +class DepthProViTSdpaAttention(DepthProViTAttention): + def __init__(self, config: DepthProConfig) -> None: + super().__init__(config) + self.attention = DepthProViTSdpaSelfAttention(config) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaAttention with 
Dinov2->DepthProViT +class DepthProViTLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath +class DepthProViTDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthProViT +class DepthProViTMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthProViT +class DepthProViTSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +DEPTHPROVIT_ATTENTION_CLASSES = 
{ + "eager": DepthProViTAttention, + "sdpa": DepthProViTSdpaAttention, +} + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2->DepthProViT +class DepthProViTLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = DEPTHPROVIT_ATTENTION_CLASSES[config._attn_implementation](config) + self.layer_scale1 = DepthProViTLayerScale(config) + self.drop_path = DepthProViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = DepthProViTSwiGLUFFN(config) + else: + self.mlp = DepthProViTMLP(config) + self.layer_scale2 = DepthProViTLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in DepthProViT, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in DepthProViT, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->DepthProViT +class DepthProViTEncoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([DepthProViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + 
last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class DepthProViT(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.embeddings = DepthProViTEmbeddings(config) + self.encoder = DepthProViTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class DepthProEncoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + + self.out_size = 24 # TODO: image_size // patch_size + + # patch encoder + self.patch_encoder = DepthProViT(config) + self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[0]].register_forward_hook( + self._intermediate0_hook + ) + self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[1]].register_forward_hook( + self._intermediate1_hook + ) + + # image encoder + self.image_encoder = DepthProViT(config) + + # upsampling features (1-2) + self.upsample_intermediate0 = self._create_project_upsample_block( + input_dims=config.hidden_size, + intermediate_dims=config.patch_encoder_feature_dims[0], + output_dims=config.decoder_hidden_size, + n_upsample_layers=3, + ) + self.upsample_intermediate1 = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[0], + n_upsample_layers=2, + ) + + # upsampling features (3-5) + self.upsample_high_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[1], + n_upsample_layers=1, + ) + self.upsample_med_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[2], + n_upsample_layers=1, + ) + self.upsample_low_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[3], + n_upsample_layers=1, + ) + + # upsampling features (6) + self.upsample_image = nn.ConvTranspose2d( + in_channels=config.hidden_size, + out_channels=config.patch_encoder_feature_dims[3], + kernel_size=2, + stride=2, + padding=0, + bias=True, + ) + self.fuse_image_with_low_res = nn.Conv2d( + 
in_channels=(config.patch_encoder_feature_dims[3] + config.patch_encoder_feature_dims[3]), + out_channels=config.patch_encoder_feature_dims[3], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + def _intermediate0_hook(self, model, input, output): + self.intermediate0_hidden_states = output[0] + + def _intermediate1_hook(self, model, input, output): + self.intermediate1_hidden_states = output[0] + + def _create_project_upsample_block( + self, + input_dims: int, + output_dims: int, + n_upsample_layers: int, + intermediate_dims: Optional[int] = None, + ) -> nn.Module: + + intermediate_dims = intermediate_dims or output_dims + + # Projection block followed by upsampling blocks. + blocks = [ + nn.Conv2d(input_dims, intermediate_dims, kernel_size=1, stride=1, padding=0, bias=False) + ] + [ + nn.ConvTranspose2d( + in_channels=(intermediate_dims if i == 0 else output_dims), + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=False + ) for i in range(n_upsample_layers) + ] + + return nn.Sequential(*blocks) + + def _interpolate(self, pixel_values, scale_factor): + return nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=scale_factor, + mode="bilinear", + align_corners=False, + ) + + def _patch(self, pixel_values, overlap_ratio): + patch_size = 384 # TODO: this should be infered + patch_stride = int(patch_size * (1 - overlap_ratio)) + + image_size = pixel_values.shape[-1] + steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1 + + x_patch_list = [] + for j in range(steps): + j0 = j * patch_stride + j1 = j0 + patch_size + + for i in range(steps): + i0 = i * patch_stride + i1 = i0 + patch_size + x_patch_list.append(pixel_values[..., j0:j1, i0:i1]) + + return torch.cat(x_patch_list, dim=0) + + def _reshape_feature( + self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 + ): + """Discard class token and reshape 1D feature map to a 2D grid.""" + b, hw, c = hidden_states.shape + + # Remove class token. 
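+        # (the patch and image encoders prepend a single [CLS] token, hence the default `cls_token_offset` of 1)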
+ if cls_token_offset > 0: + hidden_states = hidden_states[:, cls_token_offset:, :] + + # Shape: (batch, height, width, dim) -> (batch, dim, height, width) + hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) + return hidden_states + + def _merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor: + """Merge the patched input into a image with sliding window.""" + steps = int(math.sqrt(x.shape[0] // batch_size)) + + idx = 0 + + output_list = [] + for j in range(steps): + output_row_list = [] + for i in range(steps): + output = x[batch_size * idx : batch_size * (idx + 1)] + + if j != 0: + output = output[..., padding:, :] + if i != 0: + output = output[..., :, padding:] + if j != steps - 1: + output = output[..., :-padding, :] + if i != steps - 1: + output = output[..., :, :-padding] + + output_row_list.append(output) + idx += 1 + + output_row = torch.cat(output_row_list, dim=-1) + output_list.append(output_row) + output = torch.cat(output_list, dim=-2) + return output + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.shape[0] + + # STEP 1: create 3-level image + + high_res = pixel_values + med_res = self._interpolate(pixel_values, 0.5) + low_res = self._interpolate(pixel_values, 0.25) + + # STEP 2: create patches + + high_res_patches = self._patch(high_res, 0.25) + med_res_patches = self._patch(med_res, 0.5) + low_res_patches = low_res + + patches = torch.cat( + (high_res_patches, med_res_patches, low_res_patches), + dim=0, + ) + + # STEP 3: apply patch encoder + + patch_encodings = self.patch_encoder( + patches, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + patch_features = patch_encodings[0] + patch_features = self._reshape_feature( + patch_features, self.out_size, self.out_size + ) + + # STEP 4: Get Intermediate Features (features 1 and 2) + + intermediate0_features = self._reshape_feature( + self.intermediate0_hidden_states, + self.out_size, + self.out_size, + ) + intermediate1_features = self._reshape_feature( + self.intermediate1_hidden_states, + self.out_size, + self.out_size, + ) + intermediate0_features = self._merge( + intermediate0_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + intermediate1_features = self._merge( + intermediate1_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + # STEP 5: Get Patch Encoder Features (features 3-5) + + high_res_features, med_res_features, low_res_features = torch.split( + patch_features, + [len(high_res_patches), len(med_res_patches), len(low_res_patches)], + dim=0, + ) + + high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) + med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) + low_res_features = low_res_features + + # STEP 6: Get Image Encoder Features (features 6) + + image_encodings = self.image_encoder( + pixel_values=low_res_patches, + head_mask=head_mask, + 
output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        image_features = image_encodings[0]
+        image_features = self._reshape_feature(
+            image_features, self.out_size, self.out_size
+        )
+
+        # STEP 7: Upsample All Features (features 1-6)
+
+        # features (1-2)
+        intermediate0_features = self.upsample_intermediate0(
+            intermediate0_features
+        )
+        intermediate1_features = self.upsample_intermediate1(
+            intermediate1_features
+        )
+
+        # features (3-5)
+        high_res_features = self.upsample_high_res(high_res_features)
+        med_res_features = self.upsample_med_res(med_res_features)
+        low_res_features = self.upsample_low_res(low_res_features)
+
+        # feature (6)
+        image_features = self.upsample_image(image_features)
+        image_features = self.fuse_image_with_low_res(
+            torch.cat((low_res_features, image_features), dim=1)
+        )
+
+        last_hidden_state = [
+            intermediate0_features,
+            intermediate1_features,
+            high_res_features,
+            med_res_features,
+            # low_res_features,
+            image_features,  # fused with low_res_features
+        ]
+
+        hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None
+        attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None
+
+        if not return_dict:
+            return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None)
+
+        return BaseModelOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=hidden_states,
+            attentions=attentions,
+        )
+
+
+class DepthProFOVModel(nn.Module):
+    def __init__(self, config: DepthProConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.decoder_hidden_size = config.decoder_hidden_size
+
+        self.encoder = DepthProViT(config)
+        self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2)
+        self.low_res_neck = nn.Sequential(
+            nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1),
+            nn.ReLU(True)
+        )
+        self.head = nn.Sequential(
+            nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1),
+            nn.ReLU(True),
+            nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1),
+            nn.ReLU(True),
+            nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0),
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        low_res_features: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        pixel_values = nn.functional.interpolate(
+            pixel_values,
+            size=None,
+            scale_factor=0.25,
+            mode="bilinear",
+            align_corners=False,
+        )
+        encoder_outputs = self.encoder(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        image_features = encoder_outputs[0]
+
+        image_features = self.encoder_neck(image_features)
+
+        # drop the [CLS] token and move channels to the second dimension:
+        # (batch_size, seq_len, decoder_hidden_size // 2) -> (batch_size, decoder_hidden_size // 2, seq_len)
+        image_features = image_features[:, 1:]
+        image_features = image_features.permute(0, 2, 1)
+
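+        # project the decoder's low-resolution features (channels halved, spatial size halved by the stride-2
+        # convolution) so they can be reshaped against and added to the flattened ViT features below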
low_res_features = self.low_res_neck(low_res_features) + + image_features = image_features.reshape_as(low_res_features) + image_features = image_features + low_res_features + fov_output = self.head(image_features) + fov_output = fov_output.reshape(1) + + if not return_dict: + head_outputs = (fov_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=fov_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro +class DepthProResidualLayer(nn.Module): + def __init__(self, config): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_decoder + self.hidden_size = config.decoder_hidden_size + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + self.hidden_size, + self.hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=(not self.use_batch_norm), + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + self.hidden_size, + self.hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=(not self.use_batch_norm), + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(self.hidden_size) + self.batch_norm2 = nn.BatchNorm2d(self.hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +class DepthProFeatureFusionLayer(nn.Module): + def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: + super().__init__() + self.config = config + self.use_deconv = use_deconv + + self.residual_layer1 = DepthProResidualLayer(config) + self.residual_layer2 = DepthProResidualLayer(config) + + if self.use_deconv: + self.deconv = nn.ConvTranspose2d( + in_channels=config.decoder_hidden_size, + out_channels=config.decoder_hidden_size, + kernel_size=2, + stride=2, + padding=0, + bias=False, + ) + + self.projection = nn.Conv2d(config.decoder_hidden_size, config.decoder_hidden_size, kernel_size=1, bias=True) + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, hidden_state, residual=None): + if residual is not None: + hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) + + hidden_state = self.residual_layer2(hidden_state) + if self.use_deconv: + hidden_state = self.deconv(hidden_state) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage +class DepthProDecoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + + self.hidden_size = config.decoder_hidden_size + self.decoder_feature_dims = [config.decoder_hidden_size] + config.patch_encoder_feature_dims + + self.projections = nn.ModuleList() + self.fusions = nn.ModuleList() + for i, dim in enumerate(self.decoder_feature_dims): + + # Projection + if i != 0: + # conv for hidden_states[1:] + projection = nn.Conv2d( + in_channels=dim, + out_channels=self.hidden_size, + kernel_size=3, 
+ stride=1, + padding=1, + bias=False, + ) + elif self.hidden_size != dim: + # first hidden_state with dim differnet from hidden_size + projection = nn.Conv2d( + in_channels=dim, + out_channels=self.hidden_size, + kernel_size=1, + bias=False, + ) + else: + # first hidden_state with dim same as hidden_size + projection = nn.Identity() + self.projections.append(projection) + + # Fusion + fusion = DepthProFeatureFusionLayer(config, use_deconv=(i!=0)) + self.fusions.append(fusion) + + def forward(self, hidden_states): + + if len(hidden_states) != len(self.decoder_feature_dims): + raise ValueError( + f"Got number of hidden_states = {len(hidden_states)}," + f"expected number of hidden_states = {len(self.decoder_feature_dims)}." + ) + + # first extract the low_res_features + last_features = hidden_states[-1] + last_features = self.projections[-1](last_features) + low_res_features = last_features # required later for fov_encoder + last_features = self.fusions[-1](last_features) + + # now get features through each layer + for i in range(len(hidden_states) - 2, -1, -1): + hidden_state = hidden_states[i] + projection = self.projections[i] + fusion = self.fusions[i] + + projected = projection(hidden_state) + last_features = fusion(last_features, projected) + + return last_features, low_res_features + + +class DepthProPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DepthProViTSwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +DEPTH_PRO_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_PRO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.",
+    DEPTH_PRO_START_DOCSTRING,
+)
+class DepthProModel(DepthProPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.use_fov = config.use_fov
+
+        # dinov2 (vit) like encoder
+        self.encoder = DepthProEncoder(config)
+        # dpt (vit) like decoder
+        self.decoder = DepthProDecoder(config)
+        # dinov2 (vit) like encoder
+        self.fov_model = DepthProFOVModel(config) if self.use_fov else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        embeddings = {
+            "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings,
+            "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings,
+        }
+        if self.use_fov:
+            embeddings["fov_embeddings"] = self.fov_model.encoder.embeddings.patch_embeddings
+        return embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads)
+            self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads)
+            if self.use_fov:
+                self.fov_model.encoder.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING)
+    # TODO
+    # @add_code_sample_docstrings(
+    #     checkpoint=_CHECKPOINT_FOR_DOC,
+    #     output_type=BaseModelOutputWithPoolingAndIntermediateActivations,
+    #     config_class=_CONFIG_FOR_DOC,
+    #     modality="vision",
+    #     expected_output=_EXPECTED_OUTPUT_SHAPE,
+    # )
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        encodings = self.encoder(
+            pixel_values,
+            head_mask,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+        )
+
+        encodings_last_hidden_state = encodings.last_hidden_state if return_dict else encodings[0]
+
+        features, low_res_features = self.decoder(encodings_last_hidden_state)
+
+        if self.use_fov:
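+            # the FOV head re-encodes a downsampled copy of the image and fuses it with the decoder's
+            # low-resolution features; detaching them keeps the FOV branch from backpropagating into the decoder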
+ fov_out = self.fov_model( + pixel_values=pixel_values, + low_res_features=low_res_features.detach(), + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + fov_out = None + + return features, fov_out + + +class DepthProDepthEstimationHead(nn.Module): + """ + # TODO + Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the paper's + supplementary material). + """ + + def __init__(self, config): + super().__init__() + self.config = config + + features = config.decoder_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), + nn.ConvTranspose2d( + in_channels=features//2, out_channels=features//2, + kernel_size=2, stride=2, padding=0, bias=True + ), + nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ) + + + def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: + predicted_depth = self.head(hidden_states) + predicted_depth = predicted_depth.squeeze(dim=1) + return predicted_depth + + +@add_start_docstrings( + """ + DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). + """, + DEPTH_PRO_START_DOCSTRING, +) +class DepthProForDepthEstimation(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.depth_pro = DepthProModel(config) + self.head = DepthProDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Returns: + + Examples: + ```python + >>> from transformers import AutoImageProcessor, DPTForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large") + >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... predicted_depth = outputs.predicted_depth + + >>> # interpolate to original size + >>> prediction = torch.nn.functional.interpolate( + ... predicted_depth.unsqueeze(1), + ... size=image.size[::-1], + ... mode="bicubic", + ... align_corners=False, + ... 
) + + >>> # visualize the prediction + >>> output = prediction.squeeze().cpu().numpy() + >>> formatted = (output * 255 / np.max(output)).astype("uint8") + >>> depth = Image.fromarray(formatted) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = [None] * 4 + + hidden_states, fov_out = self.depth_pro( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + predicted_depth = self.head(hidden_states) + ic(predicted_depth.shape) + ic(fov_out.shape) + + # ic(predicted_depth); exit() + ic(fov_out); exit() + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + # hidden_states=outputs.hidden_states, + # attentions=outputs.attentions, + ) From 1728a2ff687435bc615a8c67d9a4f55baa6ff8d4 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 9 Nov 2024 16:23:06 +0500 Subject: [PATCH 002/151] refactor model architechture --- .../depth_pro/configuration_depth_pro.py | 19 +- .../models/depth_pro/modeling_depth_pro.py | 478 ++++++++++-------- 2 files changed, 288 insertions(+), 209 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index ad0f1016f7a1..7e66e679c67f 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -129,9 +129,18 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, + patch_encoder_feature_dims = [256, 512, 1024, 1024], + patch_encoder_hook_ids = [5, 11], # patch_encoder_hook_ids = [5, 11, 17, 23], - patch_encoder_feature_dims = [256, 512, 1024, 1024], + intermediate_feature_dims = [256, 256], + intermediate_upsample_layers = [3, 2], + high_res_feature_dims = 512, + med_res_feature_dims = 1024, + low_res_feature_dims = 1024, + image_feature_dims = 1024, + global_feature_dims = 1024, + use_batch_norm_in_decoder=False, use_fov=False, **kwargs, @@ -165,3 +174,11 @@ def __init__( self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov = use_fov + + self.intermediate_feature_dims = intermediate_feature_dims + self.intermediate_upsample_layers = intermediate_upsample_layers + self.high_res_feature_dims = high_res_feature_dims + self.med_res_feature_dims = med_res_feature_dims + self.low_res_feature_dims = low_res_feature_dims + self.image_feature_dims = image_feature_dims + self.global_feature_dims = global_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f73b74060f57..74669bc4e557 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -568,105 +568,112 @@ def forward( ) +class 
DepthProUpsampleBlock(nn.Module): + def __init__( + self, + input_dims, + intermediate_dims, + output_dims, + n_upsample_layers, + use_proj=True, + bias=False, + ) -> None: + super().__init__() + + # create first projection block + if use_proj: + self.proj = nn.Conv2d( + in_channels=input_dims, + out_channels=intermediate_dims, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + else: + self.proj = nn.Identity() + + # create following upsample blocks + self.upsample_blocks = nn.Sequential() + for i in range(n_upsample_layers): + in_channels = intermediate_dims if i == 0 else output_dims + layer = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=bias, + ) + self.upsample_blocks.append(layer) + + def forward(self, features): + projected = self.proj(features) + return self.upsample_blocks(projected) + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size + self.patch_encoder_hook_ids = config.patch_encoder_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.intermediate_upsample_layers = config.intermediate_upsample_layers + self.out_size = 24 # TODO: image_size // patch_size # patch encoder self.patch_encoder = DepthProViT(config) - self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[0]].register_forward_hook( - self._intermediate0_hook - ) - self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[1]].register_forward_hook( - self._intermediate1_hook - ) # image encoder self.image_encoder = DepthProViT(config) - # upsampling features (1-2) - self.upsample_intermediate0 = self._create_project_upsample_block( - input_dims=config.hidden_size, - intermediate_dims=config.patch_encoder_feature_dims[0], - output_dims=config.decoder_hidden_size, - n_upsample_layers=3, - ) - self.upsample_intermediate1 = self._create_project_upsample_block( - input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[0], - n_upsample_layers=2, - ) + # upsampling intermediate features - (1-2) in diagram + self.upsample_intermediate = nn.ModuleList() + for i, (feature_dims, upsample_layers) in enumerate(zip( + self.intermediate_feature_dims, + self.intermediate_upsample_layers, + )): + intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims + upsample_block = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=intermediate_dims, + output_dims=feature_dims, + n_upsample_layers=upsample_layers, + ) + self.upsample_intermediate.append(upsample_block) - # upsampling features (3-5) - self.upsample_high_res = self._create_project_upsample_block( + # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram + self.upsample_high_res = DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[1], + intermediate_dims=config.high_res_feature_dims, + output_dims=config.high_res_feature_dims, n_upsample_layers=1, ) - self.upsample_med_res = self._create_project_upsample_block( + self.upsample_med_res = DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[2], + intermediate_dims=config.med_res_feature_dims, + output_dims=config.med_res_feature_dims, n_upsample_layers=1, ) - self.upsample_low_res = self._create_project_upsample_block( + self.upsample_low_res = 
DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[3], + intermediate_dims=config.low_res_feature_dims, + output_dims=config.low_res_feature_dims, n_upsample_layers=1, ) - # upsampling features (6) - self.upsample_image = nn.ConvTranspose2d( - in_channels=config.hidden_size, - out_channels=config.patch_encoder_feature_dims[3], - kernel_size=2, - stride=2, - padding=0, - bias=True, - ) - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=(config.patch_encoder_feature_dims[3] + config.patch_encoder_feature_dims[3]), - out_channels=config.patch_encoder_feature_dims[3], - kernel_size=1, - stride=1, - padding=0, + # upsampling image features - (6) in diagram + self.upsample_image = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=config.hidden_size, + output_dims=config.image_feature_dims, + n_upsample_layers=1, + use_proj=False, bias=True, ) - def _intermediate0_hook(self, model, input, output): - self.intermediate0_hidden_states = output[0] - - def _intermediate1_hook(self, model, input, output): - self.intermediate1_hidden_states = output[0] - - def _create_project_upsample_block( - self, - input_dims: int, - output_dims: int, - n_upsample_layers: int, - intermediate_dims: Optional[int] = None, - ) -> nn.Module: - - intermediate_dims = intermediate_dims or output_dims - - # Projection block followed by upsampling blocks. - blocks = [ - nn.Conv2d(input_dims, intermediate_dims, kernel_size=1, stride=1, padding=0, bias=False) - ] + [ - nn.ConvTranspose2d( - in_channels=(intermediate_dims if i == 0 else output_dims), - out_channels=output_dims, - kernel_size=2, - stride=2, - padding=0, - bias=False - ) for i in range(n_upsample_layers) - ] - - return nn.Sequential(*blocks) - def _interpolate(self, pixel_values, scale_factor): return nn.functional.interpolate( pixel_values, @@ -771,97 +778,100 @@ def forward( dim=0, ) - # STEP 3: apply patch encoder + # STEP 3: apply patch and image encoder patch_encodings = self.patch_encoder( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_hidden_states=True, # required for intermediate features return_dict=True, ) - patch_features = patch_encodings[0] - patch_features = self._reshape_feature( - patch_features, self.out_size, self.out_size + image_encodings = self.image_encoder( + pixel_values=low_res_patches, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, ) - # STEP 4: Get Intermediate Features (features 1 and 2) - - intermediate0_features = self._reshape_feature( - self.intermediate0_hidden_states, - self.out_size, - self.out_size, - ) - intermediate1_features = self._reshape_feature( - self.intermediate1_hidden_states, - self.out_size, - self.out_size, - ) - intermediate0_features = self._merge( - intermediate0_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 - ) - intermediate1_features = self._merge( - intermediate1_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 - ) + # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram - # STEP 5: Get Patch Encoder Features (features 3-5) + # a. extract hidden_state + hidden_state = patch_encodings.last_hidden_state + # b. 
reshape back to image like + features = self._reshape_feature( + hidden_state, self.out_size, self.out_size + ) high_res_features, med_res_features, low_res_features = torch.split( - patch_features, + features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, ) + # c. merge patches back together high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) - low_res_features = low_res_features + low_res_features = low_res_features # no merge required with low res image - # STEP 6: Get Image Encoder Features (features 6) + # d. upsample + high_res_features = self.upsample_high_res(high_res_features) + med_res_features = self.upsample_med_res(med_res_features) + low_res_features = self.upsample_low_res(low_res_features) - image_encodings = self.image_encoder( - pixel_values=low_res_patches, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - image_features = image_encodings[0] - image_features = self._reshape_feature( - image_features, self.out_size, self.out_size - ) + # STEP 5: get intermediate features - (1-2) in diagram - # STEP 7: Upsample All Features (feature 1-6) + intermediate_features = [] + for layer_id in self.patch_encoder_hook_ids: + + # a. extract hidden_state + hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well - # feature (1-2) - intermediate0_features = self.upsample_intermediate0( - intermediate0_features - ) - intermediate1_features = self.upsample_intermediate1( - intermediate1_features + # b. reshape back to image like + features = self._reshape_feature( + hidden_state, + self.out_size, + self.out_size, + ) + + # c. merge patches back together + features = self._merge( + features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + # d. upsample + features = self.upsample_intermediate[layer_id](features) + + intermediate_features.append(features) + + # STEP 6: get image features - (6) in diagram + + # a. extract hidden_state + hidden_state = image_encodings.last_hidden_state + + # b. reshape back to image like + image_features = self._reshape_feature( + hidden_state, self.out_size, self.out_size ) - # feature (3-5) - high_res_features = self.upsample_high_res(high_res_features) - med_res_features = self.upsample_med_res(med_res_features) - low_res_features = self.upsample_low_res(low_res_features) + # c. merge patches back together + # skipped, no merge required with low res image - # feature (6) + # d. 
upsample image_features = self.upsample_image(image_features) - image_features = self.fuse_image_with_low_res( - torch.cat((low_res_features, image_features), dim=1) - ) + # STEP 7: return these features last_hidden_state = [ - intermediate0_features, - intermediate1_features, + *intermediate_features, high_res_features, med_res_features, - # low_res_features, - image_features, # fused with low_res_features + low_res_features, + image_features, ] - hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_attentions else None - attentions = patch_encodings.attentions + image_encodings.attentions if output_hidden_states else None + hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None if not return_dict: return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) @@ -882,7 +892,7 @@ def __init__(self, config: DepthProConfig) -> None: self.encoder = DepthProViT(config) self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) - self.low_res_neck = nn.Sequential( + self.global_neck = nn.Sequential( nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) @@ -897,7 +907,7 @@ def __init__(self, config: DepthProConfig) -> None: def forward( self, pixel_values: torch.Tensor, - low_res_features: torch.Tensor, + global_features: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, @@ -923,19 +933,19 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - image_features = encoder_outputs[0] + last_hidden_state = encoder_outputs[0] - image_features = self.encoder_neck(image_features) + last_hidden_state = self.encoder_neck(last_hidden_state) # TODO: add some comments - image_features = image_features[:, 1:] - image_features = image_features.permute(0, 2, 1) + last_hidden_state = last_hidden_state[:, 1:] + last_hidden_state = last_hidden_state.permute(0, 2, 1) - low_res_features = self.low_res_neck(low_res_features) + global_features = self.global_neck(global_features) - image_features = image_features.reshape_as(low_res_features) - image_features = image_features + low_res_features - fov_output = self.head(image_features) + last_hidden_state = last_hidden_state.reshape_as(global_features) + last_hidden_state = last_hidden_state + global_features + fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(1) if not return_dict: @@ -1040,65 +1050,126 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - self.hidden_size = config.decoder_hidden_size - self.decoder_feature_dims = [config.decoder_hidden_size] + config.patch_encoder_feature_dims - - self.projections = nn.ModuleList() - self.fusions = nn.ModuleList() - for i, dim in enumerate(self.decoder_feature_dims): - - # Projection - if i != 0: - # conv for hidden_states[1:] - projection = nn.Conv2d( - in_channels=dim, - out_channels=self.hidden_size, + # for STEP 2: fuse low_res and image features + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.low_res_feature_dims+config.image_feature_dims, + out_channels=config.global_feature_dims, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + # for STEP 3: apply decoder block for global features + self.global_proj = 
nn.Conv2d( + in_channels=config.global_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.global_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 4: apply decoder block for med features + self.med_res_proj = nn.Conv2d( + in_channels=config.med_res_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.med_res_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 5: apply decoder block for high features + self.high_res_proj = nn.Conv2d( + in_channels=config.high_res_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.high_res_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 6: apply decoder block for intermediate features + self.intermediate_proj = nn.Sequential() + self.intermediate_fusion = nn.Sequential() + for i, feature_dim in enumerate(config.intermediate_feature_dims): + if i == 0: + # no projection for final intermediate layer + proj = nn.Identity() + fusion = DepthProFeatureFusionLayer(config, use_deconv=False) + else: + proj = nn.Conv2d( + in_channels=feature_dim, + out_channels=config.decoder_hidden_size, kernel_size=3, stride=1, padding=1, bias=False, ) - elif self.hidden_size != dim: - # first hidden_state with dim differnet from hidden_size - projection = nn.Conv2d( - in_channels=dim, - out_channels=self.hidden_size, - kernel_size=1, - bias=False, - ) - else: - # first hidden_state with dim same as hidden_size - projection = nn.Identity() - self.projections.append(projection) + fusion = DepthProFeatureFusionLayer(config) - # Fusion - fusion = DepthProFeatureFusionLayer(config, use_deconv=(i!=0)) - self.fusions.append(fusion) + self.intermediate_proj.append(proj) + self.intermediate_fusion.append(fusion) def forward(self, hidden_states): - if len(hidden_states) != len(self.decoder_feature_dims): - raise ValueError( - f"Got number of hidden_states = {len(hidden_states)}," - f"expected number of hidden_states = {len(self.decoder_feature_dims)}." 
- ) + # STEP 1: extract features - # first extract the low_res_features - last_features = hidden_states[-1] - last_features = self.projections[-1](last_features) - low_res_features = last_features # required later for fov_encoder - last_features = self.fusions[-1](last_features) + intermediate_features = hidden_states[:-4] + # intermediate_features_i.shape: [batch_size, config.intermediate_feature_dims_i, 768, 768], [1, 256, 384, 384] + high_res_features = hidden_states[-4] + # high_res_features.shape: [batch_size, config.high_res_feature_dims, 192, 192] + med_res_features = hidden_states[-3] + # med_res_features.shape: [batch_size, config.med_res_feature_dims, 96, 96] + low_res_features = hidden_states[-2] + # low_res_features.shape: [batch_size, config.low_res_feature_dims, 48, 48] + image_features = hidden_states[-1] + # image_features.shape: [batch_size, config.image_feature_dims, 48, 48] - # now get features through each layer - for i in range(len(hidden_states) - 2, -1, -1): - hidden_state = hidden_states[i] - projection = self.projections[i] - fusion = self.fusions[i] + # STEP 2: fuse low_res and image features - projected = projection(hidden_state) - last_features = fusion(last_features, projected) + global_features = torch.cat((low_res_features, image_features), dim=1) + global_features = self.fuse_image_with_low_res(global_features) + # global_features.shape: [batch_size, config.global_feature_dims, 48, 48] - return last_features, low_res_features + # STEP 3: apply decoder block for global features + + # apply projection: used by fusion now and then fov later + global_projected = self.global_proj(global_features) + # apply fusion: used by next projections and fusions + last_features = self.global_fusion(global_projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 96, 96] + + # STEP 4: apply decoder block for med features + + projected = self.med_res_proj(med_res_features) + last_features = self.med_res_fusion(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 192, 192] + + # STEP 5: apply decoder block for high features + + projected = self.high_res_proj(high_res_features) + last_features = self.high_res_fusion(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 384, 384] + + # STEP 6: apply decoder block for intermediate features + + for (features, proj_layer, fusion_layer) in zip( + # reversed becuase decoding is applied from last features to first features + intermediate_features[::-1], + self.intermediate_proj[::-1], + self.intermediate_fusion[::-1], + ): + projected = proj_layer(features) + last_features = fusion_layer(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] + # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] + + return last_features, global_projected class DepthProPreTrainedModel(PreTrainedModel): @@ -1233,26 +1304,18 @@ def forward( encodings = self.encoder( pixel_values, head_mask, - output_attentions, - output_hidden_states, - return_dict, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) - encodings_last_hidden_state = encodings.last_hidden_state - - for i in range(len(encodings_last_hidden_state)): - ic(encodings_last_hidden_state[i].shape) - - features, low_res_features = self.decoder(encodings_last_hidden_state) - - ic(features.shape) - ic(low_res_features.shape) - # ic(features); exit() + last_hidden_state = 
encodings[0] + last_hidden_state, global_features = self.decoder(last_hidden_state) if self.use_fov: fov_out = self.fov_model( pixel_values=pixel_values, - low_res_features=low_res_features.detach(), + global_features=global_features.detach(), head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -1261,7 +1324,8 @@ def forward( else: fov_out = None - return features, fov_out + # TODO: return all hidden_states + return last_hidden_state, fov_out class DepthProDepthEstimationHead(nn.Module): @@ -1375,18 +1439,16 @@ def forward( outputs = [None] * 4 - hidden_states, fov_out = self.depth_pro( + last_hidden_state, fov_out = self.depth_pro( pixel_values=pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - predicted_depth = self.head(hidden_states) - ic(predicted_depth.shape) - ic(fov_out.shape) + predicted_depth = self.head(last_hidden_state) - # ic(predicted_depth); exit() + ic(predicted_depth) ic(fov_out); exit() if not return_dict: From 11ce50c5cf2c87839909da806b1a9dc1665c11f2 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 12 Nov 2024 10:49:46 +0500 Subject: [PATCH 003/151] update model outputs --- .../models/depth_pro/modeling_depth_pro.py | 77 ++++++++++++++----- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 74669bc4e557..daa2bbbdd64b 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -24,9 +24,10 @@ from torch import nn from dataclasses import dataclass +from ...utils import ModelOutput from ...activations import ACT2FN from ...modeling_outputs import ( - BaseModelOutput, + BaseModelOutput, DepthEstimatorOutput ) from ...utils import ( add_code_sample_docstrings, @@ -1232,6 +1233,18 @@ def _init_weights(self, module): """ +@dataclass +class DepthProModelOutput(BaseModelOutput): + """ + Base class for model's outputs, with potential fov, hidden states and attentions. + + Args: + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + Field of View Scaler. 
+ """ + fov: Optional[torch.FloatTensor] = None + + @add_start_docstrings( "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", DEPTH_PRO_START_DOCSTRING, @@ -1306,14 +1319,14 @@ def forward( head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) - last_hidden_state = encodings[0] + last_hidden_state = encodings.last_hidden_state last_hidden_state, global_features = self.decoder(last_hidden_state) if self.use_fov: - fov_out = self.fov_model( + fov_encodings = self.fov_model( pixel_values=pixel_values, global_features=global_features.detach(), head_mask=head_mask, @@ -1321,11 +1334,24 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) + fov = fov_encodings.last_hidden_state else: - fov_out = None + fov = None + + attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + + if not return_dict: + outputs = (last_hidden_state, fov, hidden_states, attentions) + outputs = (i for i in outputs if i is not None) + return outputs - # TODO: return all hidden_states - return last_hidden_state, fov_out + return DepthProModelOutput( + last_hidden_state=last_hidden_state, + fov=fov, + hidden_states=hidden_states, + attentions=attentions, + ) class DepthProDepthEstimationHead(nn.Module): @@ -1360,6 +1386,18 @@ def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: return predicted_depth +@dataclass +class DepthProDepthEstimatorOutput(DepthEstimatorOutput): + """ + Base class for outputs of DepthProDepthEstimator. + + Args: + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + Field of View Scaler. + """ + fov: Optional[torch.FloatTensor] = None + + @add_start_docstrings( """ DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). 
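
A quick illustrative sketch (not from the patch itself): the depth estimation head referenced in the docstring above can be exercised in isolation. Everything below is an assumption for illustration, namely that the in-progress `depth_pro` modules import cleanly and that the decoder hands the head a `(batch_size, decoder_hidden_size, 768, 768)` feature map, which is the shape the decoder comments earlier in this series state.

```python
# Sketch only: runs DepthProDepthEstimationHead on a random decoder-like feature map.
# The import paths and the 768x768 input size are assumptions based on this PR's code.
import torch

from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig
from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimationHead

config = DepthProConfig()                      # decoder_hidden_size defaults to 256
head = DepthProDepthEstimationHead(config)

features = torch.randn(1, config.decoder_hidden_size, 768, 768)
depth = head(features)                         # 3x3 conv, 2x transposed-conv upsample, then down to 1 channel
print(depth.shape)                             # torch.Size([1, 1536, 1536]): one depth map per image
```
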
@@ -1436,31 +1474,28 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + # use_fov = use_fov if use_fov is not None else self.config.use_fov - outputs = [None] * 4 - - last_hidden_state, fov_out = self.depth_pro( + depth_pro_outputs = self.depth_pro( pixel_values=pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) + last_hidden_state = depth_pro_outputs[0] predicted_depth = self.head(last_hidden_state) - ic(predicted_depth) - ic(fov_out); exit() - if not return_dict: - if output_hidden_states: - output = (predicted_depth,) + outputs[1:] + if loss is None: + return (predicted_depth,) + depth_pro_outputs[1:] else: - output = (predicted_depth,) + outputs[2:] - return ((loss,) + output) if loss is not None else output + return (loss, predicted_depth) + depth_pro_outputs[1:] - return DepthEstimatorOutput( + return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - # hidden_states=outputs.hidden_states, - # attentions=outputs.attentions, + fov=depth_pro_outputs.fov, + hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, ) From 27e9593ada48c5c17a3a96e67bff534e022359ad Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:23:03 +0500 Subject: [PATCH 004/151] update init param to include use_fov_model --- .../models/depth_pro/modeling_depth_pro.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index daa2bbbdd64b..f8b69bfec86e 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1239,7 +1239,7 @@ class DepthProModelOutput(BaseModelOutput): Base class for model's outputs, with potential fov, hidden states and attentions. Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. 
""" fov: Optional[torch.FloatTensor] = None @@ -1250,17 +1250,17 @@ class DepthProModelOutput(BaseModelOutput): DEPTH_PRO_START_DOCSTRING, ) class DepthProModel(DepthProPreTrainedModel): - def __init__(self, config): + def __init__(self, config, use_fov_model=None): super().__init__(config) self.config = config - self.use_fov = config.use_fov + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model # dinov2 (vit) like encoder self.encoder = DepthProEncoder(config) # dpt (vit) like decoder self.decoder = DepthProDecoder(config) # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov else None + self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None # Initialize weights and apply final processing self.post_init() @@ -1325,7 +1325,7 @@ def forward( last_hidden_state = encodings.last_hidden_state last_hidden_state, global_features = self.decoder(last_hidden_state) - if self.use_fov: + if self.use_fov_model: fov_encodings = self.fov_model( pixel_values=pixel_values, global_features=global_features.detach(), @@ -1392,7 +1392,7 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): Base class for outputs of DepthProDepthEstimator. Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. """ fov: Optional[torch.FloatTensor] = None @@ -1405,10 +1405,11 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): DEPTH_PRO_START_DOCSTRING, ) class DepthProForDepthEstimation(DepthProPreTrainedModel): - def __init__(self, config): + def __init__(self, config, use_fov_model=None): super().__init__(config) + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - self.depth_pro = DepthProModel(config) + self.depth_pro = DepthProModel(config, use_fov_model=self.use_fov_model) self.head = DepthProDepthEstimationHead(config) # Initialize weights and apply final processing @@ -1474,7 +1475,6 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - # use_fov = use_fov if use_fov is not None else self.config.use_fov depth_pro_outputs = self.depth_pro( pixel_values=pixel_values, From e74a7f505f91a24117e7838e367b72a50ff9e8f1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:24:21 +0500 Subject: [PATCH 005/151] update param name in config --- src/transformers/models/depth_pro/configuration_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 7e66e679c67f..a4037c99ee0f 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -142,7 +142,7 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, - use_fov=False, + use_fov_model=False, **kwargs, ): super().__init__(**kwargs) @@ -173,7 +173,7 @@ def __init__( self.patch_encoder_hook_ids = patch_encoder_hook_ids self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder - self.use_fov = use_fov + self.use_fov_model = use_fov_model self.intermediate_feature_dims = 
intermediate_feature_dims self.intermediate_upsample_layers = intermediate_upsample_layers From 8c2460b0655dd3ef698b765eb64c79cc785c7d10 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:51:56 +0500 Subject: [PATCH 006/151] fix hidden_states and attentions outputs for fov --- src/transformers/models/depth_pro/modeling_depth_pro.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f8b69bfec86e..620133771c06 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1332,14 +1332,15 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) fov = fov_encodings.last_hidden_state + attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None else: fov = None - - attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + attentions = encodings.attentions + hidden_states = encodings.hidden_states if not return_dict: outputs = (last_hidden_state, fov, hidden_states, attentions) From 55f6ed3439cef2a731b8b78cba3b6142e3125447 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 11:20:56 +0500 Subject: [PATCH 007/151] sort config --- .../models/depth_pro/configuration_depth_pro.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index a4037c99ee0f..16ff55e9cb6c 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -129,10 +129,7 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, - patch_encoder_feature_dims = [256, 512, 1024, 1024], - patch_encoder_hook_ids = [5, 11], - # patch_encoder_hook_ids = [5, 11, 17, 23], intermediate_feature_dims = [256, 256], intermediate_upsample_layers = [3, 2], high_res_feature_dims = 512, @@ -140,7 +137,6 @@ def __init__( low_res_feature_dims = 1024, image_feature_dims = 1024, global_feature_dims = 1024, - use_batch_norm_in_decoder=False, use_fov_model=False, **kwargs, @@ -171,10 +167,8 @@ def __init__( self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states self.patch_encoder_hook_ids = patch_encoder_hook_ids - self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model - self.intermediate_feature_dims = intermediate_feature_dims self.intermediate_upsample_layers = intermediate_upsample_layers self.high_res_feature_dims = high_res_feature_dims From b25dffb5d7f0aef86bb7c2dac990c24b28dafb5a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 11:21:13 +0500 Subject: [PATCH 008/151] complete minor todos --- .../models/depth_pro/modeling_depth_pro.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 
620133771c06..956fe7afb7f7 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -938,8 +938,7 @@ def forward( last_hidden_state = self.encoder_neck(last_hidden_state) - # TODO: add some comments - last_hidden_state = last_hidden_state[:, 1:] + last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token last_hidden_state = last_hidden_state.permute(0, 2, 1) global_features = self.global_neck(global_features) @@ -1357,10 +1356,10 @@ def forward( class DepthProDepthEstimationHead(nn.Module): """ - # TODO - Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples - the predictions to the input resolution after the first convolutional layer (details can be found in the paper's - supplementary material). + The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. + This module comprises a sequence of convolutional and transposed convolutional layers + that process the feature map from the decoder to produce a single-channel depth map. + Key operations include dimensionality reduction and upsampling to match the input resolution. """ def __init__(self, config): From c225deb0d126a8420ccb5e381fa2e120abedabf0 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 13:22:15 +0500 Subject: [PATCH 009/151] update patching --- .../models/depth_pro/modeling_depth_pro.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 956fe7afb7f7..59b6d46e30ca 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -685,23 +685,25 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - patch_size = 384 # TODO: this should be infered - patch_stride = int(patch_size * (1 - overlap_ratio)) + B, C, H, W = pixel_values.shape - image_size = pixel_values.shape[-1] - steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1 + patch_size = 384 # TODO: this should be inferred + stride = int(patch_size * (1 - overlap_ratio)) - x_patch_list = [] - for j in range(steps): - j0 = j * patch_stride - j1 = j0 + patch_size + if pixel_values.dim() != 4: + raise ValueError("Input tensor must have shape (B, C, H, W).") - for i in range(steps): - i0 = i * patch_stride - i1 = i0 + patch_size - x_patch_list.append(pixel_values[..., j0:j1, i0:i1]) + # pixel_values.shape (B, C, H, W) + patches = torch.nn.functional.unfold( + pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) + ) + # patches.shape (B, -1, num_patches) + patches = patches.permute(2, 0, 1) + # patches.shape (num_patches, B, -1) + patches = patches.reshape(-1, C, patch_size, patch_size) + # patches.shape (B * num_patches, C, patch_size, patch_size) - return torch.cat(x_patch_list, dim=0) + return patches def _reshape_feature( self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 @@ -760,7 +762,7 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - batch_size = pixel_values.shape[0] + B, C, H, W = pixel_values.shape # STEP 1: create 3-level image @@ -812,8 +814,8 @@ def forward( ) # c. 
merge patches back together - high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) - med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) + high_res_features = self._merge(high_res_features, batch_size=B, padding=3) + med_res_features = self._merge(med_res_features, batch_size=B, padding=6) low_res_features = low_res_features # no merge required with low res image # d. upsample @@ -838,7 +840,7 @@ def forward( # c. merge patches back together features = self._merge( - features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + features[: B * 5 * 5], batch_size=B, padding=3 ) # d. upsample From 176932dc6aba7bfaf541bee756fc493f541434dd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 16:35:43 +0500 Subject: [PATCH 010/151] update config for encoder --- .../depth_pro/configuration_depth_pro.py | 14 ++- .../models/depth_pro/modeling_depth_pro.py | 108 ++++++++++-------- 2 files changed, 71 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 16ff55e9cb6c..cdf3cf4d8d70 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -119,7 +119,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-6, image_size=384, - patch_size=16, # changed + patch_size=16, # TODO remove this num_channels=3, qkv_bias=True, layerscale_value=1.0, @@ -139,6 +139,13 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, use_fov_model=False, + + # aux_image_size=1536, + # aux_patch_size=384, + aux_image_size=1536 // 2, + aux_patch_size=384 // 2, + aux_num_channels=3, + patch_embeddings_size=16, **kwargs, ): super().__init__(**kwargs) @@ -176,3 +183,8 @@ def __init__( self.low_res_feature_dims = low_res_feature_dims self.image_feature_dims = image_feature_dims self.global_feature_dims = global_feature_dims + + self.aux_image_size = aux_image_size + self.aux_patch_size = aux_patch_size + self.aux_num_channels = aux_num_channels + self.patch_embeddings_size = patch_embeddings_size diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 59b6d46e30ca..3d3d356cc0ee 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -55,22 +55,22 @@ class DepthProViTPatchEmbeddings(nn.Module): def __init__(self, config): super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.hidden_size - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + self.config = config + self.in_channels = config.aux_num_channels + self.out_channels = config.hidden_size + self.patch_embeddings_size = config.patch_embeddings_size + + self.projection = nn.Conv2d( + self.in_channels, + self.out_channels, + kernel_size=(self.patch_embeddings_size, 
self.patch_embeddings_size), + stride=(self.patch_embeddings_size, self.patch_embeddings_size), + ) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.num_channels: + if num_channels != self.config.aux_num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." @@ -89,10 +89,12 @@ class DepthProViTEmbeddings(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() + self.config = config + self.seq_len = (config.aux_patch_size // config.patch_embeddings_size) ** 2 + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) - num_patches = self.patch_embeddings.num_patches - self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.patch_size = config.patch_size self.config = config @@ -107,11 +109,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_patches = embeddings.shape[1] - 1 num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes - if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: return self.position_embeddings class_pos_embed = self.position_embeddings[:, :1] @@ -119,8 +120,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: dim = embeddings.shape[-1] - new_height = height // self.patch_size - new_width = width // self.patch_size + new_height = height // self.patch_size # TODO: check this + new_width = width // self.patch_size # TODO: check this sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) @@ -621,8 +622,9 @@ def __init__(self, config: DepthProConfig) -> None: self.patch_encoder_hook_ids = config.patch_encoder_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims self.intermediate_upsample_layers = config.intermediate_upsample_layers - - self.out_size = 24 # TODO: image_size // patch_size + + self.out_size = config.aux_patch_size // config.patch_embeddings_size + self.seq_len = self.out_size ** 2 # patch encoder self.patch_encoder = DepthProViT(config) @@ -685,23 +687,18 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - B, C, H, W = pixel_values.shape - - patch_size = 384 # TODO: this should be inferred + patch_size = self.config.aux_patch_size stride = int(patch_size * (1 - overlap_ratio)) - if pixel_values.dim() != 4: - raise ValueError("Input tensor must have shape (B, C, H, W).") - - # pixel_values.shape (B, C, H, W) + # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) patches = torch.nn.functional.unfold( pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) ) # patches.shape (B, -1, num_patches) patches = 
patches.permute(2, 0, 1) # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, C, patch_size, patch_size) - # patches.shape (B * num_patches, C, patch_size, patch_size) + patches = patches.reshape(-1, self.config.aux_num_channels, patch_size, patch_size) + # patches.shape (B * num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) return patches @@ -762,24 +759,33 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if pixel_values.dim() != 4: + raise ValueError("Input tensor must have shape (B, C, H, W).") + B, C, H, W = pixel_values.shape + # TODO validate: H = W = aux_image_size + # TODO validate: C = aux_num_channels + # TODO validate: aux_image_size = aux_patch_size * 4 + + # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # STEP 1: create 3-level image - high_res = pixel_values - med_res = self._interpolate(pixel_values, 0.5) - low_res = self._interpolate(pixel_values, 0.25) + high_res = pixel_values # (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + med_res = self._interpolate(pixel_values, 0.5) # (B, config.aux_num_channels, config.aux_image_size//2, config.aux_image_size//2) + low_res = self._interpolate(pixel_values, 0.25) # (B, config.aux_num_channels, config.aux_image_size//4, config.aux_image_size//4) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) - med_res_patches = self._patch(med_res, 0.5) - low_res_patches = low_res + high_res_patches = self._patch(high_res, 0.25) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + med_res_patches = self._patch(med_res, 0.5) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + low_res_patches = low_res # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) patches = torch.cat( (high_res_patches, med_res_patches, low_res_patches), dim=0, - ) + ) # (num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) # STEP 3: apply patch and image encoder @@ -801,42 +807,43 @@ def forward( # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram # a. extract hidden_state - hidden_state = patch_encodings.last_hidden_state + hidden_state = patch_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) + ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) high_res_features, med_res_features, low_res_features = torch.split( features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, - ) + ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) # c. merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, padding=3) - med_res_features = self._merge(med_res_features, batch_size=B, padding=6) - low_res_features = low_res_features # no merge required with low res image + high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) + med_res_features = self._merge(med_res_features, batch_size=B, padding=6) # (B, config.hidden_size, ~, ~) + low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, ~, ~) # d. 
upsample - high_res_features = self.upsample_high_res(high_res_features) - med_res_features = self.upsample_med_res(med_res_features) - low_res_features = self.upsample_low_res(low_res_features) + high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, ~, ~) + med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, ~, ~) + low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, ~, ~) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] for layer_id in self.patch_encoder_hook_ids: - + # a. extract hidden_state hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well + # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size, - ) + ) # (num_patches, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together features = self._merge( @@ -845,24 +852,25 @@ def forward( # d. upsample features = self.upsample_intermediate[layer_id](features) + # (B, config.intermediate_feature_dims[layer_id], ~, ~) intermediate_features.append(features) # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = image_encodings.last_hidden_state + hidden_state = image_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) + ) # (num_patches, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together # skipped, no merge required with low res image # d. 
upsample - image_features = self.upsample_image(image_features) + image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, ~, ~) # STEP 7: return these features last_hidden_state = [ From dcec5228b21352f6638c27c91f1d4056323eba95 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 16:46:17 +0500 Subject: [PATCH 011/151] fix config --- .../depth_pro/configuration_depth_pro.py | 20 +++----- .../models/depth_pro/modeling_depth_pro.py | 48 +++++++++---------- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index cdf3cf4d8d70..fc12b37b19d0 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -118,9 +118,12 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-6, - image_size=384, - patch_size=16, # TODO remove this + # image_size=1536, + # patch_size=384, + image_size=1536 // 2, + patch_size=384 // 2, num_channels=3, + patch_embeddings_size=16, qkv_bias=True, layerscale_value=1.0, drop_path_rate=0.0, @@ -139,13 +142,6 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, use_fov_model=False, - - # aux_image_size=1536, - # aux_patch_size=384, - aux_image_size=1536 // 2, - aux_patch_size=384 // 2, - aux_num_channels=3, - patch_embeddings_size=16, **kwargs, ): super().__init__(**kwargs) @@ -163,6 +159,7 @@ def __init__( self.image_size = image_size self.patch_size = patch_size self.num_channels = num_channels + self.patch_embeddings_size = patch_embeddings_size self.qkv_bias = qkv_bias self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate @@ -183,8 +180,3 @@ def __init__( self.low_res_feature_dims = low_res_feature_dims self.image_feature_dims = image_feature_dims self.global_feature_dims = global_feature_dims - - self.aux_image_size = aux_image_size - self.aux_patch_size = aux_patch_size - self.aux_num_channels = aux_num_channels - self.patch_embeddings_size = patch_embeddings_size diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3d3d356cc0ee..d56391313979 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -57,7 +57,7 @@ def __init__(self, config): super().__init__() self.config = config - self.in_channels = config.aux_num_channels + self.in_channels = config.num_channels self.out_channels = config.hidden_size self.patch_embeddings_size = config.patch_embeddings_size @@ -70,7 +70,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.config.aux_num_channels: + if num_channels != self.config.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." 
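
A small illustrative calculation (not from the patch itself) to make the size bookkeeping in these config changes concrete. It assumes the full-resolution defaults that are kept commented out here and restored a couple of commits later (image_size=1536, patch_size=384, patch_embeddings_size=16) together with the overlap ratios used in `DepthProEncoder.forward`; the resulting 25 + 9 + 1 crops per image are also where the hard-coded `batch_size * 5 * 5` slice in the encoder comes from.

```python
# Sketch only: derives the token and patch counts implied by the config values,
# nothing here is read from the library.
image_size = 1536            # intended default (halved in this commit, restored later)
patch_size = 384             # side of each crop fed to the ViT patch encoder
patch_embeddings_size = 16   # pixels per ViT token

out_size = patch_size // patch_embeddings_size   # 24 tokens per side after patch embedding
seq_len = out_size ** 2                          # 576 patch tokens (+1 cls token)

def crops_per_image(side, overlap_ratio):
    """Number of overlapping patch_size crops that F.unfold produces per image."""
    stride = int(patch_size * (1 - overlap_ratio))
    per_dim = (side - patch_size) // stride + 1
    return per_dim ** 2

high_res = crops_per_image(image_size, 0.25)      # 5 x 5 = 25 crops of the full-res image
med_res = crops_per_image(image_size // 2, 0.5)   # 3 x 3 = 9 crops of the half-res image
low_res = 1                                       # the quarter-res image is already patch-sized

print(out_size, seq_len, high_res, med_res, low_res)   # 24 576 25 9 1
```
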
@@ -90,14 +90,12 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - self.seq_len = (config.aux_patch_size // config.patch_embeddings_size) ** 2 + self.seq_len = (config.patch_size // config.patch_embeddings_size) ** 2 self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.patch_size = config.patch_size - self.config = config def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ @@ -120,8 +118,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: dim = embeddings.shape[-1] - new_height = height // self.patch_size # TODO: check this - new_width = width // self.patch_size # TODO: check this + new_height = height // self.config.patch_embeddings_size + new_width = width // self.config.patch_embeddings_size sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) @@ -623,7 +621,7 @@ def __init__(self, config: DepthProConfig) -> None: self.intermediate_feature_dims = config.intermediate_feature_dims self.intermediate_upsample_layers = config.intermediate_upsample_layers - self.out_size = config.aux_patch_size // config.patch_embeddings_size + self.out_size = config.patch_size // config.patch_embeddings_size self.seq_len = self.out_size ** 2 # patch encoder @@ -687,18 +685,18 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - patch_size = self.config.aux_patch_size + patch_size = self.config.patch_size stride = int(patch_size * (1 - overlap_ratio)) - # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) patches = torch.nn.functional.unfold( pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) ) # patches.shape (B, -1, num_patches) patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, self.config.aux_num_channels, patch_size, patch_size) - # patches.shape (B * num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + patches = patches.reshape(-1, self.config.num_channels, patch_size, patch_size) + # patches.shape (B * num_patches, config.num_channels, config.patch_size, config.patch_size) return patches @@ -764,28 +762,28 @@ def forward( B, C, H, W = pixel_values.shape - # TODO validate: H = W = aux_image_size - # TODO validate: C = aux_num_channels - # TODO validate: aux_image_size = aux_patch_size * 4 + # TODO validate: H = W = image_size + # TODO validate: C = num_channels + # TODO validate: image_size = patch_size * 4 - # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) # STEP 1: create 3-level image - high_res = pixel_values # (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) - med_res = self._interpolate(pixel_values, 0.5) # (B, config.aux_num_channels, config.aux_image_size//2, config.aux_image_size//2) - low_res = self._interpolate(pixel_values, 0.25) # (B, config.aux_num_channels, config.aux_image_size//4, 
config.aux_image_size//4) + high_res = pixel_values # (B, config.num_channels, config.image_size, config.image_size) + med_res = self._interpolate(pixel_values, 0.5) # (B, config.num_channels, config.image_size//2, config.image_size//2) + low_res = self._interpolate(pixel_values, 0.25) # (B, config.num_channels, config.image_size//4, config.image_size//4) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) - med_res_patches = self._patch(med_res, 0.5) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) - low_res_patches = low_res # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + high_res_patches = self._patch(high_res, 0.25) # (-1, config.num_channels, config.patch_size, config.patch_size) + med_res_patches = self._patch(med_res, 0.5) # (-1, config.num_channels, config.patch_size, config.patch_size) + low_res_patches = low_res # (-1, config.num_channels, config.patch_size, config.patch_size) patches = torch.cat( (high_res_patches, med_res_patches, low_res_patches), dim=0, - ) # (num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + ) # (num_patches, config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -812,12 +810,12 @@ def forward( # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) + ) # (num_patches, config.num_channels, self.out_size, self.out_size) high_res_features, med_res_features, low_res_features = torch.split( features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, - ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) + ) # (num_patches, config.num_channels, self.out_size, self.out_size) # c. 
merge patches back together high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) From 0384d2f189062259b3b99a3d692593e28902ec0b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 19:37:00 +0500 Subject: [PATCH 012/151] use correct defaults in config --- .../models/depth_pro/configuration_depth_pro.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index fc12b37b19d0..aff3eb3e2941 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -118,10 +118,8 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-6, - # image_size=1536, - # patch_size=384, - image_size=1536 // 2, - patch_size=384 // 2, + image_size=1536, + patch_size=384, num_channels=3, patch_embeddings_size=16, qkv_bias=True, From 85e4f868b65fa5b208883cb973824ca6e2557db8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 17 Nov 2024 23:47:50 +0500 Subject: [PATCH 013/151] update merge for compatibility with different image size --- .../depth_pro/configuration_depth_pro.py | 6 +- .../models/depth_pro/modeling_depth_pro.py | 135 +++++++++++------- 2 files changed, 88 insertions(+), 53 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index aff3eb3e2941..d9f973639ad0 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -108,9 +108,9 @@ class DepthProConfig(PretrainedConfig): def __init__( self, - hidden_size=1024, # changed + hidden_size=1024, decoder_hidden_size=256, - num_hidden_layers=24, # changed + num_hidden_layers=24, num_attention_heads=16, mlp_ratio=4, hidden_act="gelu", @@ -132,7 +132,6 @@ def __init__( reshape_hidden_states=True, patch_encoder_hook_ids = [5, 11], intermediate_feature_dims = [256, 256], - intermediate_upsample_layers = [3, 2], high_res_feature_dims = 512, med_res_feature_dims = 1024, low_res_feature_dims = 1024, @@ -172,7 +171,6 @@ def __init__( self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model self.intermediate_feature_dims = intermediate_feature_dims - self.intermediate_upsample_layers = intermediate_upsample_layers self.high_res_feature_dims = high_res_feature_dims self.med_res_feature_dims = med_res_feature_dims self.low_res_feature_dims = low_res_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index d56391313979..316afe444fbb 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -619,7 +619,6 @@ def __init__(self, config: DepthProConfig) -> None: self.decoder_hidden_size = config.decoder_hidden_size self.patch_encoder_hook_ids = config.patch_encoder_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims - self.intermediate_upsample_layers = config.intermediate_upsample_layers self.out_size = config.patch_size // config.patch_embeddings_size self.seq_len = self.out_size ** 2 @@ -632,17 +631,15 @@ def __init__(self, config: DepthProConfig) -> None: # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() - for 
i, (feature_dims, upsample_layers) in enumerate(zip( - self.intermediate_feature_dims, - self.intermediate_upsample_layers, - )): + for i, feature_dims in enumerate(self.intermediate_feature_dims): intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims upsample_block = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=upsample_layers, + n_upsample_layers=1+len(self.intermediate_feature_dims)-i, ) + self.upsample_intermediate.append(upsample_block) # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram @@ -714,34 +711,46 @@ def _reshape_feature( hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) return hidden_states - def _merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor: + def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: """Merge the patched input into a image with sliding window.""" - steps = int(math.sqrt(x.shape[0] // batch_size)) - - idx = 0 - - output_list = [] - for j in range(steps): - output_row_list = [] - for i in range(steps): - output = x[batch_size * idx : batch_size * (idx + 1)] + # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) + box_size = int(math.sqrt(x.shape[0] // batch_size)) - if j != 0: - output = output[..., padding:, :] - if i != 0: - output = output[..., :, padding:] - if j != steps - 1: - output = output[..., :-padding, :] - if i != steps - 1: - output = output[..., :, :-padding] - - output_row_list.append(output) - idx += 1 - - output_row = torch.cat(output_row_list, dim=-1) - output_list.append(output_row) - output = torch.cat(output_list, dim=-2) - return output + """ + merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) + padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) + """ + padding = ( box_size * self.out_size - merge_out_size ) // ( 2 * box_size - 2 ) + + i = 0 + boxes = [] + for h in range(box_size): + boxes_in_row = [] + for w in range(box_size): + box = x[batch_size * i : batch_size * (i + 1)] + + if h != 0: + # remove pad from height if box is not at top border + box = box[..., padding:, :] + if w != 0: + # remove pad from width if box is not at left border + box = box[..., :, padding:] + if h != box_size - 1: + # remove pad from height if box is not at bottom border + box = box[..., :box.shape[-2]-padding, :] + if w != box_size - 1: + # remove pad from width if box is not at right border + box = box[..., :, :box.shape[-1]-padding] + + boxes_in_row.append(box) + i += 1 + + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + + boxes = torch.cat(boxes, dim=-2) + boxes = boxes[..., :merge_out_size, :merge_out_size] + return boxes def forward( self, @@ -818,19 +827,19 @@ def forward( ) # (num_patches, config.num_channels, self.out_size, self.out_size) # c. 
merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) - med_res_features = self._merge(med_res_features, batch_size=B, padding=6) # (B, config.hidden_size, ~, ~) - low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, ~, ~) + high_res_features = self._merge(high_res_features, batch_size=B, merge_out_size=self.out_size*4) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) + med_res_features = self._merge(med_res_features, batch_size=B, merge_out_size=self.out_size*2) # (B, config.hidden_size, self.out_size*2**1, self.out_size*2**1) + low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, self.out_size*2**0, self.out_size*2**0) # d. upsample - high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, ~, ~) - med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, ~, ~) - low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, ~, ~) + high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, self.out_size*2**3, self.out_size*2**3) + med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, self.out_size*2**2, self.out_size*2**2) + low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, self.out_size*2**1, self.out_size*2**1) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] - for layer_id in self.patch_encoder_hook_ids: + for i, layer_id in enumerate(self.patch_encoder_hook_ids): # a. extract hidden_state hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well @@ -845,12 +854,12 @@ def forward( # c. merge patches back together features = self._merge( - features[: B * 5 * 5], batch_size=B, padding=3 - ) + features[: B * 5 * 5], batch_size=B, merge_out_size=self.out_size*4, + ) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) # d. upsample features = self.upsample_intermediate[layer_id](features) - # (B, config.intermediate_feature_dims[layer_id], ~, ~) + # (B, config.intermediate_feature_dims[i], self.out_size*2**(3+total-i), self.out_size*2**(3+total-i)) intermediate_features.append(features) @@ -868,16 +877,25 @@ def forward( # skipped, no merge required with low res image # d. 
upsample - image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, ~, ~) + image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) # STEP 7: return these features last_hidden_state = [ - *intermediate_features, - high_res_features, - med_res_features, - low_res_features, - image_features, + *intermediate_features, # (B, config.image_feature_dims, self.out_size*2**3+total-i, self.out_size*2**3+total-i) + high_res_features, # (B, config.image_feature_dims, self.out_size*2**3, self.out_size*2**3) + med_res_features, # (B, config.image_feature_dims, self.out_size*2**2, self.out_size*2**2) + low_res_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) + image_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) ] + # for i in last_hidden_state: + # ic(i.shape) + # exit() + + # 768, 384, 192, 96, 48, 48 - image_size=1536 + # 384, 192, 96, 48, 24, 24 - image_size=768 (ideal) + # 288, 144, 72, 24, 24, 24 - image_size=768 (practical) + # 1536, 768, 384, 192, 96, 96 - image_size=3072 (ideal) + # 1728, 864, 432, 240, 96, 96 - image_size=3072 (practical) hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None @@ -951,6 +969,11 @@ def forward( global_features = self.global_neck(global_features) + ic(last_hidden_state.shape) + ic(global_features.shape) + + # exit() + last_hidden_state = last_hidden_state.reshape_as(global_features) last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) @@ -1107,7 +1130,15 @@ def __init__(self, config: DepthProConfig) -> None: for i, feature_dim in enumerate(config.intermediate_feature_dims): if i == 0: # no projection for final intermediate layer - proj = nn.Identity() + if feature_dim == config.decoder_hidden_size: + proj = nn.Identity() + else: + proj = nn.Conv2d( + in_channels=feature_dim, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) fusion = DepthProFeatureFusionLayer(config, use_deconv=False) else: proj = nn.Conv2d( @@ -1124,6 +1155,10 @@ def __init__(self, config: DepthProConfig) -> None: self.intermediate_fusion.append(fusion) def forward(self, hidden_states): + ic("Start of Decoder") + + for i in hidden_states: + ic(i.shape) # STEP 1: extract features @@ -1492,7 +1527,9 @@ def forward( return_dict=True, ) last_hidden_state = depth_pro_outputs[0] + ic(last_hidden_state.shape) predicted_depth = self.head(last_hidden_state) + ic(predicted_depth.shape) if not return_dict: if loss is None: From 00e4aa3b7bb04324cd08f2f87a2a34f4033fccca Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 11:04:58 +0500 Subject: [PATCH 014/151] restructure encoder for custom configuration --- .../depth_pro/configuration_depth_pro.py | 21 +- .../models/depth_pro/modeling_depth_pro.py | 842 ++++++++---------- 2 files changed, 395 insertions(+), 468 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d9f973639ad0..055830900417 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -59,6 +59,7 @@ class DepthProConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon 
used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): + TODO: image_size / 2**n_decoder_blocks = patch_size / patch_embeddings_size The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. @@ -130,13 +131,11 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, - patch_encoder_hook_ids = [5, 11], + intermediate_hook_ids = [11, 5], intermediate_feature_dims = [256, 256], - high_res_feature_dims = 512, - med_res_feature_dims = 1024, - low_res_feature_dims = 1024, - image_feature_dims = 1024, - global_feature_dims = 1024, + scaled_images_ratios = [0.25, 0.5, 1], + scaled_images_overlap_ratios = [0.0, 0.5, 0.25], + scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_decoder=False, use_fov_model=False, **kwargs, @@ -167,12 +166,10 @@ def __init__( ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.patch_encoder_hook_ids = patch_encoder_hook_ids self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model + self.intermediate_hook_ids = intermediate_hook_ids self.intermediate_feature_dims = intermediate_feature_dims - self.high_res_feature_dims = high_res_feature_dims - self.med_res_feature_dims = med_res_feature_dims - self.low_res_feature_dims = low_res_feature_dims - self.image_feature_dims = image_feature_dims - self.global_feature_dims = global_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 316afe444fbb..9f146177402c 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -226,7 +226,6 @@ def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
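For reference, the padding that the updated `_merge` derives from `merge_out_size` can be sanity-checked against the previously hard-coded values of 3 (high res) and 6 (med res). A minimal sketch, assuming the default `image_size=1536`, `patch_size=384`, `patch_embeddings_size=16` and the overlap ratios used above:

import math

def merge_padding(num_patches_per_image: int, out_size: int, merge_out_size: int) -> int:
    # mirrors DepthProEncoder._merge: padding = (box_size * out_size - merge_out_size) / (2 * box_size - 2)
    box_size = int(math.sqrt(num_patches_per_image))
    return (box_size * out_size - merge_out_size) // (2 * box_size - 2)

out_size = 384 // 16  # patch_size // patch_embeddings_size = 24
# high res: 1536px image, overlap 0.25 -> 5x5 patches, merged back to out_size * 4
assert merge_padding(5 * 5, out_size, out_size * 4) == 3
# med res: 768px image, overlap 0.5 -> 3x3 patches, merged back to out_size * 2
assert merge_padding(3 * 3, out_size, out_size * 2) == 6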
@@ -617,11 +616,40 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.decoder_hidden_size = config.decoder_hidden_size - self.patch_encoder_hook_ids = config.patch_encoder_hook_ids + + self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.n_scaled_images = len(self.scaled_images_ratios) + self.n_intermediate_hooks = len(self.intermediate_hook_ids) self.out_size = config.patch_size // config.patch_embeddings_size - self.seq_len = self.out_size ** 2 + self.seq_len = self.out_size ** 2 # each patch is flattened + + # config.scaled_images_ratios is sorted + if config.scaled_images_ratios != sorted(config.scaled_images_ratios): + raise ValueError( + f"Values in scaled_images_ratios={config.scaled_images_ratios} " + "should be sorted from low to high" + ) + + # lowest image resolution is greator than the patch_size + if config.scaled_images_ratios[0] * config.image_size < config.patch_size: + raise ValueError( + "Image cannot be scaled to a size less than patch_size. " + f"Provide values in scaled_images_ratios={config.scaled_images_ratios} suitable " + f"to the given patch_size={config.patch_size}." + ) + + # patch_size should be a divisible by patch_embeddings_size + # else it raises an exception in DepthProViTPatchEmbeddings + if config.patch_size % config.patch_embeddings_size != 0: + raise ValueError( + f"patch_size={config.patch_size} should be divisible " + f"by patch_embeddings_size={config.patch_embeddings_size}." 
+ ) # patch encoder self.patch_encoder = DepthProViT(config) @@ -629,6 +657,17 @@ def __init__(self, config: DepthProConfig) -> None: # image encoder self.image_encoder = DepthProViT(config) + # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram + self.upsample_scaled_images = nn.ModuleList() + for i, feature_dims in enumerate(self.scaled_images_feature_dims): + upsample_block = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=feature_dims, + output_dims=feature_dims, + n_upsample_layers=1, + ) + self.upsample_scaled_images.append(upsample_block) + # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() for i, feature_dims in enumerate(self.intermediate_feature_dims): @@ -637,42 +676,33 @@ def __init__(self, config: DepthProConfig) -> None: input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=1+len(self.intermediate_feature_dims)-i, + n_upsample_layers=2+i, ) - self.upsample_intermediate.append(upsample_block) - # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram - self.upsample_high_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.high_res_feature_dims, - output_dims=config.high_res_feature_dims, - n_upsample_layers=1, - ) - self.upsample_med_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.med_res_feature_dims, - output_dims=config.med_res_feature_dims, - n_upsample_layers=1, - ) - self.upsample_low_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.low_res_feature_dims, - output_dims=config.low_res_feature_dims, - n_upsample_layers=1, - ) - # upsampling image features - (6) in diagram self.upsample_image = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=config.hidden_size, - output_dims=config.image_feature_dims, + output_dims=config.scaled_images_feature_dims[0], n_upsample_layers=1, use_proj=False, bias=True, ) + # for STEP 7: fuse low_res and image features + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.scaled_images_feature_dims[0]*2, + out_channels=config.scaled_images_feature_dims[0], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + def _interpolate(self, pixel_values, scale_factor): + if scale_factor == 1: + return pixel_values return nn.functional.interpolate( pixel_values, size=None, @@ -682,6 +712,10 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): + if pixel_values.shape[-1] == self.config.patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + patch_size = self.config.patch_size stride = int(patch_size * (1 - overlap_ratio)) @@ -712,7 +746,11 @@ def _reshape_feature( return hidden_states def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: - """Merge the patched input into a image with sliding window.""" + if batch_size == x.shape[0]: + # merge only if the patches were created from this scaled image + # pathces are not created when scaled image size is equal to patch size + return x + # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) box_size = int(math.sqrt(x.shape[0] // batch_size)) @@ -771,28 +809,35 @@ def forward( B, C, H, W = pixel_values.shape - # TODO validate: H = W = image_size - # TODO validate: C = num_channels - # TODO validate: image_size = 
patch_size * 4 + if not (H == W == self.config.image_size): + raise ValueError( + f"Height={H} and Width={W} doesnot match the specified image_size={self.config.image_size} in config." + ) + + if not (C == self.config.num_channels): + raise ValueError( + f"Found {C} channels in image, expected number of channels is {self.config.num_channels} from config." + ) # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) # STEP 1: create 3-level image - high_res = pixel_values # (B, config.num_channels, config.image_size, config.image_size) - med_res = self._interpolate(pixel_values, 0.5) # (B, config.num_channels, config.image_size//2, config.image_size//2) - low_res = self._interpolate(pixel_values, 0.25) # (B, config.num_channels, config.image_size//4, config.image_size//4) + scaled_images = [] + for ratio in self.scaled_images_ratios: + scaled_images.append(self._interpolate(pixel_values, ratio)) + # (B, config.num_channels, config.image_size * ratio, config.image_size * ratio) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) # (-1, config.num_channels, config.patch_size, config.patch_size) - med_res_patches = self._patch(med_res, 0.5) # (-1, config.num_channels, config.patch_size, config.patch_size) - low_res_patches = low_res # (-1, config.num_channels, config.patch_size, config.patch_size) - - patches = torch.cat( - (high_res_patches, med_res_patches, low_res_patches), - dim=0, - ) # (num_patches, config.num_channels, config.patch_size, config.patch_size) + for i in range(self.n_scaled_images): + scaled_images[i] = self._patch( + scaled_images[i], + overlap_ratio=self.scaled_images_overlap_ratios[i], + ) + scaled_images_num_patches = [len(i) for i in scaled_images] + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + # (sum(scaled_images_num_patches), config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -803,8 +848,13 @@ def forward( output_hidden_states=True, # required for intermediate features return_dict=True, ) + scaled_images_last_hidden_state = torch.split_with_sizes( + patch_encodings.last_hidden_state, + scaled_images_num_patches[::-1] + )[::-1] # -1 as patch encoder expects high res patches first + image_encodings = self.image_encoder( - pixel_values=low_res_patches, + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -813,89 +863,87 @@ def forward( # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram - # a. extract hidden_state - hidden_state = patch_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) + scaled_images_features = [] + for i in range(self.n_scaled_images): + # a. extract hidden_state + hidden_state = scaled_images_last_hidden_state[i] + # (scaled_images_num_patches[i], self.seq_len+1, config.hidden_size) - # b. reshape back to image like - features = self._reshape_feature( - hidden_state, self.out_size, self.out_size - ) # (num_patches, config.num_channels, self.out_size, self.out_size) - high_res_features, med_res_features, low_res_features = torch.split( - features, - [len(high_res_patches), len(med_res_patches), len(low_res_patches)], - dim=0, - ) # (num_patches, config.num_channels, self.out_size, self.out_size) + # b. 
reshape back to image like + features = self._reshape_feature( + hidden_state, self.out_size, self.out_size + ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) - # c. merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, merge_out_size=self.out_size*4) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) - med_res_features = self._merge(med_res_features, batch_size=B, merge_out_size=self.out_size*2) # (B, config.hidden_size, self.out_size*2**1, self.out_size*2**1) - low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, self.out_size*2**0, self.out_size*2**0) + # c. merge patches back together + features = self._merge( + features, batch_size=B, merge_out_size=self.out_size*2**i + ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) - # d. upsample - high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, self.out_size*2**3, self.out_size*2**3) - med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, self.out_size*2**2, self.out_size*2**2) - low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, self.out_size*2**1, self.out_size*2**1) + # d. upsample + features = self.upsample_scaled_images[i](features) + # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) + + scaled_images_features.append(features) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] - for i, layer_id in enumerate(self.patch_encoder_hook_ids): + for i in range(self.n_intermediate_hooks): # a. extract hidden_state - hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well - # (num_patches, self.seq_len+1, config.hidden_size) + layer_id = self.intermediate_hook_ids[i] + 1 # +1 to correct index position as hidden_states contain embedding output as well + hidden_state = patch_encodings.hidden_states[layer_id] + hidden_state = hidden_state[:scaled_images_num_patches[-1]] # num_patches to be of same length as highest resolution + # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size, - ) # (num_patches, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together features = self._merge( - features[: B * 5 * 5], batch_size=B, merge_out_size=self.out_size*4, - ) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) + features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - features = self.upsample_intermediate[layer_id](features) - # (B, config.intermediate_feature_dims[i], self.out_size*2**(3+total-i), self.out_size*2**(3+total-i)) + features = self.upsample_intermediate[i](features) + # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) intermediate_features.append(features) # STEP 6: get image features - (6) in diagram # a. 
extract hidden_state - hidden_state = image_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) + hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) # (num_patches, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - # skipped, no merge required with low res image + image_features = self._merge( + image_features, batch_size=B, merge_out_size=self.out_size*2**(0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) - - # STEP 7: return these features - last_hidden_state = [ - *intermediate_features, # (B, config.image_feature_dims, self.out_size*2**3+total-i, self.out_size*2**3+total-i) - high_res_features, # (B, config.image_feature_dims, self.out_size*2**3, self.out_size*2**3) - med_res_features, # (B, config.image_feature_dims, self.out_size*2**2, self.out_size*2**2) - low_res_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) - image_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) + image_features = self.upsample_image(image_features) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) + + # STEP 7: apply fusion (global_features = image_features + scaled_images_features[0]) + # fuses image_features with lowest resolution features as they are of same size + scaled_images_features[0] = torch.cat((scaled_images_features[0], image_features), dim=1) + scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) + + # STEP 8: return these features in order of increasing size as what decoder expects + last_hidden_state = [ + # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) + *scaled_images_features, + # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) + *intermediate_features, ] - # for i in last_hidden_state: - # ic(i.shape) - # exit() - - # 768, 384, 192, 96, 48, 48 - image_size=1536 - # 384, 192, 96, 48, 24, 24 - image_size=768 (ideal) - # 288, 144, 72, 24, 24, 24 - image_size=768 (practical) - # 1536, 768, 384, 192, 96, 96 - image_size=3072 (ideal) - # 1728, 864, 432, 240, 96, 96 - image_size=3072 (practical) hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None @@ -910,84 +958,133 @@ def forward( ) -class DepthProFOVModel(nn.Module): - def __init__(self, config: DepthProConfig) -> None: - super().__init__() +class DepthProPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DepthProViTSwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +DEPTH_PRO_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_PRO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
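A rough usage sketch for the restructured model, assuming `DepthProConfig` and `DepthProModel` are importable from `transformers.models.depth_pro`; with the default configuration the encoder should return five feature maps ordered from low to high resolution:

import torch
from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig
from transformers.models.depth_pro.modeling_depth_pro import DepthProModel

config = DepthProConfig()  # defaults: image_size=1536, patch_size=384, patch_embeddings_size=16
model = DepthProModel(config)

pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
with torch.no_grad():
    outputs = model(pixel_values)

# expected sizes for a batch of 1: (1, 1024, 48, 48), (1, 1024, 96, 96), (1, 512, 192, 192),
# (1, 256, 384, 384), (1, 256, 768, 768)
for feature_map in outputs.last_hidden_state:
    print(feature_map.shape)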
+""" + + +@add_start_docstrings( + "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", + DEPTH_PRO_START_DOCSTRING, +) +class DepthProModel(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) self.config = config - self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.encoder = DepthProEncoder(config) + # Initialize weights and apply final processing + self.post_init() - self.encoder = DepthProViT(config) - self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) - self.global_neck = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True) - ) - self.head = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), - ) + def get_input_embeddings(self): + embeddings = { + "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, + "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, + } + return embeddings + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # TODO + # @add_code_sample_docstrings( + # checkpoint=_CHECKPOINT_FOR_DOC, + # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, + # config_class=_CONFIG_FOR_DOC, + # modality="vision", + # expected_output=_EXPECTED_OUTPUT_SHAPE, + # ) def forward( self, - pixel_values: torch.Tensor, - global_features: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pixel_values = nn.functional.interpolate( - pixel_values, - size=None, - scale_factor=0.25, - mode="bilinear", - align_corners=False, - ) - encoder_outputs = self.encoder( + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encodings = self.encoder( pixel_values, - 
head_mask=head_mask, + head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - - last_hidden_state = self.encoder_neck(last_hidden_state) - - last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token - last_hidden_state = last_hidden_state.permute(0, 2, 1) - - global_features = self.global_neck(global_features) - - ic(last_hidden_state.shape) - ic(global_features.shape) - # exit() - - last_hidden_state = last_hidden_state.reshape_as(global_features) - last_hidden_state = last_hidden_state + global_features - fov_output = self.head(last_hidden_state) - fov_output = fov_output.reshape(1) - - if not return_dict: - head_outputs = (fov_output,) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutput( - last_hidden_state=fov_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + return encodings # Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro @@ -1075,325 +1172,109 @@ def forward(self, hidden_state, residual=None): return hidden_state -# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage -class DepthProDecoder(nn.Module): - def __init__(self, config: DepthProConfig) -> None: +# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro with extra layer parameters +class DepthProFeatureFusionStage(nn.Module): + def __init__(self, config, num_layers): super().__init__() - self.config = config - - # for STEP 2: fuse low_res and image features - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=config.low_res_feature_dims+config.image_feature_dims, - out_channels=config.global_feature_dims, - kernel_size=1, - stride=1, - padding=0, - bias=True, - ) - - # for STEP 3: apply decoder block for global features - self.global_proj = nn.Conv2d( - in_channels=config.global_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.global_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 4: apply decoder block for med features - self.med_res_proj = nn.Conv2d( - in_channels=config.med_res_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.med_res_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 5: apply decoder block for high features - self.high_res_proj = nn.Conv2d( - in_channels=config.high_res_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.high_res_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 6: apply decoder block for intermediate features - self.intermediate_proj = nn.Sequential() - self.intermediate_fusion = nn.Sequential() - for i, feature_dim in enumerate(config.intermediate_feature_dims): - if i == 0: - # no projection for final intermediate layer - if feature_dim == config.decoder_hidden_size: - proj = nn.Identity() - else: - proj = nn.Conv2d( - in_channels=feature_dim, - out_channels=config.decoder_hidden_size, - kernel_size=1, - bias=False, - ) - fusion = DepthProFeatureFusionLayer(config, use_deconv=False) - else: - proj = nn.Conv2d( - in_channels=feature_dim, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - fusion = DepthProFeatureFusionLayer(config) - - self.intermediate_proj.append(proj) - 
self.intermediate_fusion.append(fusion) + self.num_layers = num_layers + self.layers = nn.ModuleList() + for _ in range(self.num_layers-1): + self.layers.append(DepthProFeatureFusionLayer(config)) + # final layer doesnot require deconvolution + self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) def forward(self, hidden_states): - ic("Start of Decoder") - - for i in hidden_states: - ic(i.shape) - - # STEP 1: extract features - - intermediate_features = hidden_states[:-4] - # intermediate_features_i.shape: [batch_size, config.intermediate_feature_dims_i, 768, 768], [1, 256, 384, 384] - high_res_features = hidden_states[-4] - # high_res_features.shape: [batch_size, config.high_res_feature_dims, 192, 192] - med_res_features = hidden_states[-3] - # med_res_features.shape: [batch_size, config.med_res_feature_dims, 96, 96] - low_res_features = hidden_states[-2] - # low_res_features.shape: [batch_size, config.low_res_feature_dims, 48, 48] - image_features = hidden_states[-1] - # image_features.shape: [batch_size, config.image_feature_dims, 48, 48] - - # STEP 2: fuse low_res and image features - - global_features = torch.cat((low_res_features, image_features), dim=1) - global_features = self.fuse_image_with_low_res(global_features) - # global_features.shape: [batch_size, config.global_feature_dims, 48, 48] - - # STEP 3: apply decoder block for global features - - # apply projection: used by fusion now and then fov later - global_projected = self.global_proj(global_features) - # apply fusion: used by next projections and fusions - last_features = self.global_fusion(global_projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 96, 96] - - # STEP 4: apply decoder block for med features - - projected = self.med_res_proj(med_res_features) - last_features = self.med_res_fusion(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 192, 192] - - # STEP 5: apply decoder block for high features - - projected = self.high_res_proj(high_res_features) - last_features = self.high_res_fusion(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 384, 384] - - # STEP 6: apply decoder block for intermediate features - - for (features, proj_layer, fusion_layer) in zip( - # reversed becuase decoding is applied from last features to first features - intermediate_features[::-1], - self.intermediate_proj[::-1], - self.intermediate_fusion[::-1], - ): - projected = proj_layer(features) - last_features = fusion_layer(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] - # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] - - return last_features, global_projected - - -class DepthProPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = DepthProConfig - base_model_prefix = "depth_pro" - main_input_name = "pixel_values" - supports_gradient_checkpointing = True - _no_split_modules = ["DepthProViTSwiGLUFFN"] - _supports_sdpa = True - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -DEPTH_PRO_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. - - Parameters: - config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -DEPTH_PRO_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] - for details. - - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. -""" - + if self.num_layers != len(hidden_states): + raise ValueError( + f"num_layers={self.num_layers} in DepthProFeatureFusionStage" + f"doesnot match len(hidden_states)={len(hidden_states)}" + ) -@dataclass -class DepthProModelOutput(BaseModelOutput): - """ - Base class for model's outputs, with potential fov, hidden states and attentions. + # first layer only uses the last hidden_state + fused_hidden_state = self.layers[0](hidden_states[0]) + # looping from the second layer to last layer + for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): + fused_hidden_state = layer(fused_hidden_state, hidden_state) - Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): - Field of View Scaler. 
- """ - fov: Optional[torch.FloatTensor] = None + return fused_hidden_state -@add_start_docstrings( - "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", - DEPTH_PRO_START_DOCSTRING, -) -class DepthProModel(DepthProPreTrainedModel): - def __init__(self, config, use_fov_model=None): - super().__init__(config) +class DepthProFOVModel(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() self.config = config - self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - - # dinov2 (vit) like encoder - self.encoder = DepthProEncoder(config) - # dpt (vit) like decoder - self.decoder = DepthProDecoder(config) - # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - embeddings = { - "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, - "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, - } - if self.use_fov: - embeddings['fov_embeddings'] = self.fov_model.embeddings.patch_embeddings - return embeddings + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.fov_model.encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder = DepthProViT(config) + self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.global_neck = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.ReLU(True) + ) + self.head = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + ) - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # TODO - # @add_code_sample_docstrings( - # checkpoint=_CHECKPOINT_FOR_DOC, - # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, - # config_class=_CONFIG_FOR_DOC, - # modality="vision", - # expected_output=_EXPECTED_OUTPUT_SHAPE, - # ) def forward( self, - pixel_values: torch.FloatTensor, - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + pixel_values: torch.Tensor, + global_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - encodings = self.encoder( + pixel_values = nn.functional.interpolate( pixel_values, - head_mask, + size=None, + scale_factor=0.25, + mode="bilinear", + align_corners=False, + ) + encoder_outputs = self.encoder( + pixel_values, + head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, + return_dict=return_dict, ) + last_hidden_state = encoder_outputs[0] - last_hidden_state = encodings.last_hidden_state - last_hidden_state, global_features = self.decoder(last_hidden_state) + last_hidden_state = self.encoder_neck(last_hidden_state) - if self.use_fov_model: - fov_encodings = self.fov_model( - pixel_values=pixel_values, - global_features=global_features.detach(), - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - fov = fov_encodings.last_hidden_state - attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None - else: - fov = None - attentions = encodings.attentions - hidden_states = encodings.hidden_states + last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token + last_hidden_state = last_hidden_state.permute(0, 2, 1) + + global_features = self.global_neck(global_features) + + ic(last_hidden_state.shape) + ic(global_features.shape) + + + last_hidden_state = last_hidden_state.reshape_as(global_features) + last_hidden_state = last_hidden_state + global_features + fov_output = self.head(last_hidden_state) + fov_output = fov_output.reshape(1) if not return_dict: - outputs = (last_hidden_state, fov, hidden_states, attentions) - outputs = (i for i in outputs if i is not None) - return outputs + head_outputs = (fov_output,) + return head_outputs + encoder_outputs[1:] - return DepthProModelOutput( - last_hidden_state=last_hidden_state, - fov=fov, - hidden_states=hidden_states, - attentions=attentions, + return BaseModelOutput( + last_hidden_state=fov_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, ) @@ -1422,7 +1303,6 @@ def __init__(self, config): nn.ReLU(), ) - def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: predicted_depth = self.head(hidden_states) predicted_depth = predicted_depth.squeeze(dim=1) @@ -1450,14 +1330,45 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): class DepthProForDepthEstimation(DepthProPreTrainedModel): def __init__(self, config, use_fov_model=None): super().__init__(config) + self.config = config self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - self.depth_pro = DepthProModel(config, use_fov_model=self.use_fov_model) + # dinov2 (vit) like encoders + self.depth_pro = DepthProModel(config) + + # project hidden states from encoder to match expected inputs in fusion stage + combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims + self.projections = nn.ModuleList() + for i, in_channels 
in enumerate(combined_feature_dims): + if i == len(combined_feature_dims)-1 and in_channels == config.decoder_hidden_size: + # projection for last layer can be ignored if input and output channels already match + self.projections.append(nn.Identity()) + else: + self.projections.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + # dpt (vit) like fusion stage + self.num_decoder_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_decoder_layers) + + # depth estimation head self.head = DepthProDepthEstimationHead(config) + # dinov2 (vit) like encoder + self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None + # Initialize weights and apply final processing self.post_init() + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1476,6 +1387,7 @@ def forward( Returns: Examples: + TODO ```python >>> from transformers import AutoImageProcessor, DPTForDepthEstimation >>> import torch @@ -1526,21 +1438,39 @@ def forward( output_hidden_states=output_hidden_states, return_dict=True, ) - last_hidden_state = depth_pro_outputs[0] - ic(last_hidden_state.shape) - predicted_depth = self.head(last_hidden_state) - ic(predicted_depth.shape) + last_hidden_state = depth_pro_outputs.last_hidden_state + last_hidden_state = [proj(state) for proj, state in zip(self.projections, last_hidden_state)] + fused_state = self.fusion_stage(last_hidden_state) + predicted_depth = self.head(fused_state) + + if self.use_fov_model: + # use lowest scaled image features for fov model + global_features = last_hidden_state[0].detach() + fov_encodings = self.fov_model( + pixel_values=pixel_values, + global_features=global_features, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + fov = fov_encodings.last_hidden_state + attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + else: + fov = None + attentions = depth_pro_outputs.attentions + hidden_states = depth_pro_outputs.hidden_states if not return_dict: - if loss is None: - return (predicted_depth,) + depth_pro_outputs[1:] - else: - return (loss, predicted_depth) + depth_pro_outputs[1:] + outputs = (predicted_depth, fov, hidden_states, attentions) + outputs = (i for i in outputs if i is not None) + return outputs return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - fov=depth_pro_outputs.fov, - hidden_states=depth_pro_outputs.hidden_states, - attentions=depth_pro_outputs.attentions, + fov=fov, + hidden_states=hidden_states, + attentions=attentions, ) From 6be242ce30589132e71bd437fd6016827c3d8b6a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 13:51:45 +0500 Subject: [PATCH 015/151] make fov model compatible with custom config --- .../depth_pro/configuration_depth_pro.py | 2 + .../models/depth_pro/modeling_depth_pro.py | 267 ++++++++++-------- 2 files changed, 150 insertions(+), 119 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 
055830900417..8e197dbd0dab 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -138,6 +138,7 @@ def __init__( scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_decoder=False, use_fov_model=False, + num_fov_head_layers=2, **kwargs, ): super().__init__(**kwargs) @@ -168,6 +169,7 @@ def __init__( self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids self.intermediate_feature_dims = intermediate_feature_dims self.scaled_images_ratios = scaled_images_ratios diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 9f146177402c..0ddd503c4cc9 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -610,6 +610,97 @@ def forward(self, features): projected = self.proj(features) return self.upsample_blocks(projected) + +def interpolate(pixel_values, scale_factor): + return nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=scale_factor, + mode="bilinear", + align_corners=False, + ) + +def patch(pixel_values, patch_size, overlap_ratio): + """Creates Patches from Batch.""" + B, C, W, H = pixel_values.shape + + if W == H == patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + + stride = int(patch_size * (1 - overlap_ratio)) + + # (B, C, W, H) + patches = torch.nn.functional.unfold( + pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) + ) + # patches.shape (B, patch_size**2 * C, num_patches) + patches = patches.permute(2, 0, 1) + # patches.shape (num_patches, B, patch_size**2 * C) + patches = patches.reshape(-1, C, patch_size, patch_size) + # patches.shape (B * num_patches, C, patch_size, patch_size) + + return patches + +def reshape_feature(hidden_states, width, height): + """Discard class token and reshape 1D feature map to a 2D grid.""" + B, _, C = hidden_states.shape + # (B, WH+1, C) + hidden_states = hidden_states[:, 1:, :] # remove class token + # (B, WH, C) + hidden_states = hidden_states.reshape(B, width, height, C) + # (B, W, H, C) + hidden_states = hidden_states.permute(0, 3, 1, 2) + # (B, C, W, H) + return hidden_states + +def merge(patches, batch_size, merge_out_size): + """Recreates Batch from Patches.""" + num_patches, num_channels, out_size, out_size = patches.shape + + if num_patches == batch_size: + # merge only if the patches were created from scaled image + # patches are not created when scaled image size is equal to patch size + return patches + + box_size = int(math.sqrt(num_patches // batch_size)) + """ + merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) + padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) + """ + padding = ( box_size * out_size - merge_out_size ) // ( 2 * box_size - 2 ) + + i = 0 + boxes = [] + for h in range(box_size): + boxes_in_row = [] + for w in range(box_size): + box = patches[batch_size * i : batch_size * (i + 1)] + + if h != 0: + # remove pad from height if box is not at top border + box = box[..., padding:, :] + if w != 0: + # remove pad from width if box is not at left border + box = box[..., :, padding:] + if h != box_size - 1: + # remove pad from height 
if box is not at bottom border + box = box[..., :box.shape[-2]-padding, :] + if w != box_size - 1: + # remove pad from width if box is not at right border + box = box[..., :, :box.shape[-1]-padding] + + boxes_in_row.append(box) + i += 1 + + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + + boxes = torch.cat(boxes, dim=-2) + boxes = boxes[..., :merge_out_size, :merge_out_size] + return boxes + + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -700,96 +791,6 @@ def __init__(self, config: DepthProConfig) -> None: bias=True, ) - def _interpolate(self, pixel_values, scale_factor): - if scale_factor == 1: - return pixel_values - return nn.functional.interpolate( - pixel_values, - size=None, - scale_factor=scale_factor, - mode="bilinear", - align_corners=False, - ) - - def _patch(self, pixel_values, overlap_ratio): - if pixel_values.shape[-1] == self.config.patch_size: - # create patches only if scaled image is not already equal to patch size - return pixel_values - - patch_size = self.config.patch_size - stride = int(patch_size * (1 - overlap_ratio)) - - # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) - patches = torch.nn.functional.unfold( - pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) - ) - # patches.shape (B, -1, num_patches) - patches = patches.permute(2, 0, 1) - # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, self.config.num_channels, patch_size, patch_size) - # patches.shape (B * num_patches, config.num_channels, config.patch_size, config.patch_size) - - return patches - - def _reshape_feature( - self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 - ): - """Discard class token and reshape 1D feature map to a 2D grid.""" - b, hw, c = hidden_states.shape - - # Remove class token. 
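An aside on the new module-level `patch` helper introduced just above: the sketch below reproduces its unfold-based patching in isolation. The concrete sizes (a 1536 px image scaled by 0.5, 384 px patches with 0.5 overlap) are illustrative assumptions, not values read from the config or the checkpoint.

```py
import torch
import torch.nn.functional as F

pixel_values = torch.randn(1, 3, 1536, 1536)                       # (B, C, H, W)
scaled = F.interpolate(pixel_values, scale_factor=0.5,
                       mode="bilinear", align_corners=False)       # (1, 3, 768, 768)

patch_size, overlap_ratio = 384, 0.5                               # illustrative values only
stride = int(patch_size * (1 - overlap_ratio))                     # 192

patches = F.unfold(scaled, kernel_size=patch_size, stride=stride)  # (B, C*patch_size**2, num_patches)
patches = patches.permute(2, 0, 1).reshape(-1, 3, patch_size, patch_size)
print(patches.shape)                                               # torch.Size([9, 3, 384, 384]) -> a 3x3 grid of patches
```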
- if cls_token_offset > 0: - hidden_states = hidden_states[:, cls_token_offset:, :] - - # Shape: (batch, height, width, dim) -> (batch, dim, height, width) - hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) - return hidden_states - - def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: - if batch_size == x.shape[0]: - # merge only if the patches were created from this scaled image - # pathces are not created when scaled image size is equal to patch size - return x - - # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) - box_size = int(math.sqrt(x.shape[0] // batch_size)) - - """ - merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) - padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) - """ - padding = ( box_size * self.out_size - merge_out_size ) // ( 2 * box_size - 2 ) - - i = 0 - boxes = [] - for h in range(box_size): - boxes_in_row = [] - for w in range(box_size): - box = x[batch_size * i : batch_size * (i + 1)] - - if h != 0: - # remove pad from height if box is not at top border - box = box[..., padding:, :] - if w != 0: - # remove pad from width if box is not at left border - box = box[..., :, padding:] - if h != box_size - 1: - # remove pad from height if box is not at bottom border - box = box[..., :box.shape[-2]-padding, :] - if w != box_size - 1: - # remove pad from width if box is not at right border - box = box[..., :, :box.shape[-1]-padding] - - boxes_in_row.append(box) - i += 1 - - boxes_in_row = torch.cat(boxes_in_row, dim=-1) - boxes.append(boxes_in_row) - - boxes = torch.cat(boxes, dim=-2) - boxes = boxes[..., :merge_out_size, :merge_out_size] - return boxes - def forward( self, pixel_values: torch.Tensor, @@ -825,14 +826,15 @@ def forward( scaled_images = [] for ratio in self.scaled_images_ratios: - scaled_images.append(self._interpolate(pixel_values, ratio)) + scaled_images.append(interpolate(pixel_values, ratio)) # (B, config.num_channels, config.image_size * ratio, config.image_size * ratio) # STEP 2: create patches for i in range(self.n_scaled_images): - scaled_images[i] = self._patch( + scaled_images[i] = patch( scaled_images[i], + patch_size=self.config.patch_size, overlap_ratio=self.scaled_images_overlap_ratios[i], ) scaled_images_num_patches = [len(i) for i in scaled_images] @@ -870,12 +872,12 @@ def forward( # (scaled_images_num_patches[i], self.seq_len+1, config.hidden_size) # b. reshape back to image like - features = self._reshape_feature( + features = reshape_feature( hidden_state, self.out_size, self.out_size ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) # c. merge patches back together - features = self._merge( + features = merge( features, batch_size=B, merge_out_size=self.out_size*2**i ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) @@ -897,14 +899,14 @@ def forward( # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like - features = self._reshape_feature( + features = reshape_feature( hidden_state, self.out_size, self.out_size, ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together - features = self._merge( + features = merge( features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) @@ -920,12 +922,12 @@ def forward( hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like - image_features = self._reshape_feature( + image_features = reshape_feature( hidden_state, self.out_size, self.out_size ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = self._merge( + image_features = merge( image_features, batch_size=B, merge_out_size=self.out_size*2**(0), ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) @@ -1206,18 +1208,39 @@ def __init__(self, config: DepthProConfig) -> None: self.hidden_size = config.hidden_size self.decoder_hidden_size = config.decoder_hidden_size + self.out_size = config.patch_size // config.patch_embeddings_size + self.encoder = DepthProViT(config) self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) self.global_neck = nn.Sequential( nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) - self.head = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + + if config.decoder_hidden_size // 2**config.num_fov_head_layers == 0: + raise ValueError( + f"decoder_hidden_size={config.decoder_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " + "i.e config.decoder_hidden_size // 2**config.num_fov_head_layers > 0" + ) + + # create initial head layers + self.head = nn.Sequential() + for i in range(config.num_fov_head_layers): + self.head.append( + nn.Conv2d(self.decoder_hidden_size // 2**(i+1), self.decoder_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + ) + self.head.append(nn.ReLU(True)) + # calculate expected shapes to finally generate a scalar output from final head layer + final_in_channels = self.decoder_hidden_size // 2**(config.num_fov_head_layers+1) + final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) + self.head.append( + nn.Conv2d( + in_channels=final_in_channels, + out_channels=1, + kernel_size=final_kernal_size, + stride=1, + padding=0 + ) ) def forward( @@ -1235,34 +1258,40 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pixel_values = nn.functional.interpolate( + B, C, W, H = pixel_values.shape + + # follow the steps same as with image features in DepthProEncoder + pixel_values = interpolate( pixel_values, - size=None, - scale_factor=0.25, - mode="bilinear", - align_corners=False, + scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image ) - encoder_outputs = self.encoder( + patches = patch( pixel_values, + patch_size=self.config.patch_size, + overlap_ratio=self.config.scaled_images_overlap_ratios[0], + ) + encoder_outputs = self.encoder( + patches, head_mask=head_mask, 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] - last_hidden_state = self.encoder_neck(last_hidden_state) - - last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token - last_hidden_state = last_hidden_state.permute(0, 2, 1) + last_hidden_state = reshape_feature( + last_hidden_state, + width=self.out_size, + height=self.out_size + ) + last_hidden_state = merge( + last_hidden_state, + batch_size=B, + merge_out_size=self.out_size, + ) global_features = self.global_neck(global_features) - ic(last_hidden_state.shape) - ic(global_features.shape) - - - last_hidden_state = last_hidden_state.reshape_as(global_features) last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(1) From 01891085f0961ea28049616abed63a8bd9cb2f05 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 13:54:43 +0500 Subject: [PATCH 016/151] replace word "decoder" with "fusion" --- .../depth_pro/configuration_depth_pro.py | 10 ++--- .../models/depth_pro/modeling_depth_pro.py | 44 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 8e197dbd0dab..f124d3e5b71a 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -59,7 +59,7 @@ class DepthProConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): - TODO: image_size / 2**n_decoder_blocks = patch_size / patch_embeddings_size + TODO: image_size / 2**n_fusion_blocks = patch_size / patch_embeddings_size The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. 
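With the decoder-to-fusion rename in this commit, downstream code configures the model through the `fusion_*` names. A minimal sketch, assuming this development branch is installed, of building a config with the renamed fields plus the `num_fov_head_layers` option added in the previous commit:

```py
from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig

# defaults from this patch series, except use_fov_model which is switched on here
config = DepthProConfig(
    fusion_hidden_size=256,            # was `decoder_hidden_size` before this commit
    use_batch_norm_in_fusion=False,    # was `use_batch_norm_in_decoder`
    use_fov_model=True,                # enables the DepthProFOVModel branch
    num_fov_head_layers=2,             # introduced in the previous commit
)
print(config.fusion_hidden_size, config.num_fov_head_layers)
```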
@@ -110,7 +110,7 @@ class DepthProConfig(PretrainedConfig): def __init__( self, hidden_size=1024, - decoder_hidden_size=256, + fusion_hidden_size=256, num_hidden_layers=24, num_attention_heads=16, mlp_ratio=4, @@ -136,7 +136,7 @@ def __init__( scaled_images_ratios = [0.25, 0.5, 1], scaled_images_overlap_ratios = [0.0, 0.5, 0.25], scaled_images_feature_dims = [1024, 1024, 512], - use_batch_norm_in_decoder=False, + use_batch_norm_in_fusion=False, use_fov_model=False, num_fov_head_layers=2, **kwargs, @@ -144,7 +144,7 @@ def __init__( super().__init__(**kwargs) self.hidden_size = hidden_size - self.decoder_hidden_size = decoder_hidden_size + self.fusion_hidden_size = fusion_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.mlp_ratio = mlp_ratio @@ -167,7 +167,7 @@ def __init__( ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.use_batch_norm_in_decoder = use_batch_norm_in_decoder + self.use_batch_norm_in_fusion = use_batch_norm_in_fusion self.use_fov_model = use_fov_model self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 0ddd503c4cc9..0ac35b582d7f 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -706,7 +706,7 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.fusion_hidden_size = config.fusion_hidden_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -762,7 +762,7 @@ def __init__(self, config: DepthProConfig) -> None: # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() for i, feature_dims in enumerate(self.intermediate_feature_dims): - intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims + intermediate_dims = self.fusion_hidden_size if i == 0 else feature_dims upsample_block = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=intermediate_dims, @@ -939,7 +939,7 @@ def forward( scaled_images_features[0] = torch.cat((scaled_images_features[0], image_features), dim=1) scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) - # STEP 8: return these features in order of increasing size as what decoder expects + # STEP 8: return these features in order of increasing size as what fusion expects last_hidden_state = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) *scaled_images_features, @@ -1094,8 +1094,8 @@ class DepthProResidualLayer(nn.Module): def __init__(self, config): super().__init__() - self.use_batch_norm = config.use_batch_norm_in_decoder - self.hidden_size = config.decoder_hidden_size + self.use_batch_norm = config.use_batch_norm_in_fusion + self.hidden_size = config.fusion_hidden_size self.activation1 = nn.ReLU() self.convolution1 = nn.Conv2d( @@ -1151,15 +1151,15 @@ def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: if self.use_deconv: self.deconv = nn.ConvTranspose2d( - in_channels=config.decoder_hidden_size, - out_channels=config.decoder_hidden_size, + in_channels=config.fusion_hidden_size, + 
out_channels=config.fusion_hidden_size, kernel_size=2, stride=2, padding=0, bias=False, ) - self.projection = nn.Conv2d(config.decoder_hidden_size, config.decoder_hidden_size, kernel_size=1, bias=True) + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) self.skip_add = nn.quantized.FloatFunctional() def forward(self, hidden_state, residual=None): @@ -1206,32 +1206,32 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.fusion_hidden_size = config.fusion_hidden_size self.out_size = config.patch_size // config.patch_embeddings_size self.encoder = DepthProViT(config) - self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.encoder_neck = nn.Linear(self.hidden_size, self.fusion_hidden_size // 2) self.global_neck = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) - if config.decoder_hidden_size // 2**config.num_fov_head_layers == 0: + if config.fusion_hidden_size // 2**config.num_fov_head_layers == 0: raise ValueError( - f"decoder_hidden_size={config.decoder_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " - "i.e config.decoder_hidden_size // 2**config.num_fov_head_layers > 0" + f"fusion_hidden_size={config.fusion_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " + "i.e config.fusion_hidden_size // 2**config.num_fov_head_layers > 0" ) # create initial head layers self.head = nn.Sequential() for i in range(config.num_fov_head_layers): self.head.append( - nn.Conv2d(self.decoder_hidden_size // 2**(i+1), self.decoder_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + nn.Conv2d(self.fusion_hidden_size // 2**(i+1), self.fusion_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.decoder_hidden_size // 2**(config.num_fov_head_layers+1) + final_in_channels = self.fusion_hidden_size // 2**(config.num_fov_head_layers+1) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( @@ -1311,7 +1311,7 @@ class DepthProDepthEstimationHead(nn.Module): """ The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. This module comprises a sequence of convolutional and transposed convolutional layers - that process the feature map from the decoder to produce a single-channel depth map. + that process the feature map from the fusion to produce a single-channel depth map. Key operations include dimensionality reduction and upsampling to match the input resolution. 
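For reference, a small arithmetic check (illustration only, using the defaults shown in this series) of the channel schedule that the `DepthProFOVModel` head earlier in this hunk derives from `fusion_hidden_size` and `num_fov_head_layers`, including the `ValueError` guard:

```py
fusion_hidden_size, num_fov_head_layers = 256, 2   # defaults in this patch series

# guard mirrored from DepthProFOVModel.__init__
assert fusion_hidden_size // 2**num_fov_head_layers > 0

# encoder_neck / global_neck reduce features to fusion_hidden_size // 2 = 128 channels,
# each head conv then halves them again before a final conv produces one FOV value
in_channels = [fusion_hidden_size // 2**(i + 1) for i in range(num_fov_head_layers + 1)]
print(in_channels)   # [128, 64, 32] -> convs 128->64, 64->32, then a final 32->1 conv
```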
""" @@ -1319,7 +1319,7 @@ def __init__(self, config): super().__init__() self.config = config - features = config.decoder_hidden_size + features = config.fusion_hidden_size self.head = nn.Sequential( nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( @@ -1369,14 +1369,14 @@ def __init__(self, config, use_fov_model=None): combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims self.projections = nn.ModuleList() for i, in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims)-1 and in_channels == config.decoder_hidden_size: + if i == len(combined_feature_dims)-1 and in_channels == config.fusion_hidden_size: # projection for last layer can be ignored if input and output channels already match self.projections.append(nn.Identity()) else: self.projections.append( nn.Conv2d( in_channels=in_channels, - out_channels=config.decoder_hidden_size, + out_channels=config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, @@ -1385,8 +1385,8 @@ def __init__(self, config, use_fov_model=None): ) # dpt (vit) like fusion stage - self.num_decoder_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) - self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_decoder_layers) + self.num_fusion_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_fusion_layers) # depth estimation head self.head = DepthProDepthEstimationHead(config) From 7614e1a709c14c8f9e32730fe240e401ae023ec3 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 24 Nov 2024 13:57:36 +0500 Subject: [PATCH 017/151] weight conversion script --- .../depth_pro/convert_depth_pro_to_hf.py | 344 ++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 src/transformers/models/depth_pro/convert_depth_pro_to_hf.py diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py new file mode 100644 index 000000000000..38b7a7853d76 --- /dev/null +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DepthPro checkpoints from the original repository. 
+ +URL: https://huggingface.co/apple/DepthPro/tree/main +""" + +import argparse +import json +from pathlib import Path +import re + +import requests +import torch +import torch.nn as nn +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model +from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.utils import logging + +# TODO: import directly from transformers +from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig +from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def create_vit_rename_keys(config): + rename_keys = [] + # fmt: off + + # patch embedding layer + rename_keys.append(("cls_token", "embeddings.cls_token")) + rename_keys.append(("pos_embed", "embeddings.position_embeddings")) + rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) + + for i in range(config.num_hidden_layers): + # layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) + # MLP + if config.use_swiglu_ffn: + rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) + rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) + rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) + rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) + else: + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) + # layerscale + rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) + rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) + # attention projection layer + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) + + # final layernorm + rename_keys.append(("norm.weight", "layernorm.weight")) + rename_keys.append(("norm.bias", "layernorm.bias")) + + # fmt: on + return rename_keys + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + state_dict_keys = state_dict.keys() + for key in list(state_dict_keys): + if "qkv" in key: + in_proj = state_dict.pop(key) + q, k, v = torch.split(in_proj, config.hidden_size, dim=0) + + if "fov" in key: + key = key.replace('fov.encoder.0', 'fov_model.encoder') + else: + key = "depth_pro." 
+ key + + key = key.replace("blocks", "encoder.layer") + state_dict[key.replace("attn.qkv", "attention.attention.query")] = q + state_dict[key.replace("attn.qkv", "attention.attention.key")] = k + state_dict[key.replace("attn.qkv", "attention.attention.value")] = v + return state_dict + +# hard coded upsample keys +def update_hard_coded_keys(state_dict): + mapping = [ + # upsamples + ('encoder.upsample_latent0.0.weight', 'depth_pro.encoder.upsample_intermediate.1.proj.weight'), + ('encoder.upsample_latent0.1.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight'), + ('encoder.upsample_latent0.2.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight'), + ('encoder.upsample_latent0.3.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight'), + ('encoder.upsample_latent1.0.weight', 'depth_pro.encoder.upsample_intermediate.0.proj.weight'), + ('encoder.upsample_latent1.1.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight'), + ('encoder.upsample_latent1.2.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight'), + ('encoder.upsample0.0.weight', 'depth_pro.encoder.upsample_scaled_images.2.proj.weight'), + ('encoder.upsample0.1.weight', 'depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight'), + ('encoder.upsample1.0.weight', 'depth_pro.encoder.upsample_scaled_images.1.proj.weight'), + ('encoder.upsample1.1.weight', 'depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight'), + ('encoder.upsample2.0.weight', 'depth_pro.encoder.upsample_scaled_images.0.proj.weight'), + ('encoder.upsample2.1.weight', 'depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight'), + ('encoder.upsample_lowres.weight', 'depth_pro.encoder.upsample_image.upsample_blocks.0.weight'), + ('encoder.upsample_lowres.bias', 'depth_pro.encoder.upsample_image.upsample_blocks.0.bias'), + + # neck + ("fov.downsample.0.weight", "fov_model.global_neck.0.weight"), + ("fov.downsample.0.bias", "fov_model.global_neck.0.bias"), + ("fov.encoder.1.weight", "fov_model.encoder_neck.weight"), + ("fov.encoder.1.bias", "fov_model.encoder_neck.bias"), + ] + for src, dest in mapping: + state_dict[dest] = state_dict.pop(src) + + return state_dict + + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + + + +@torch.no_grad() +def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our DepthPro structure. + """ + + # define default DepthPro configuration + config = DepthProConfig() + + # load original weights from huggingface hub + # TODO: download from hub + # file_path = hf_hub_download(repo_id, filename) + file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" + state_dict = torch.load(file_path, weights_only=True) + + # enumerate fusion layers + n_scaled_images = len(config.scaled_images_ratios) # 3 + n_intermediate_hooks = len(config.intermediate_hook_ids) # 2 + n_fusion_layers = n_scaled_images + n_intermediate_hooks # 5 + + # 1. 
keys for vit encoders + vit_rename_keys = create_vit_rename_keys(config) + for src_prefix, dest_prefix in [ + ("encoder.patch_encoder", "depth_pro.encoder.patch_encoder"), + ("encoder.image_encoder", "depth_pro.encoder.image_encoder"), + ("fov.encoder.0", "fov_model.encoder"), + ]: + for src, dest in vit_rename_keys: + src = src_prefix + "." + src + dest = dest_prefix + "." + dest + state_dict[dest] = state_dict.pop(src) + + # 2. qkv keys for vit encoders + state_dict = read_in_q_k_v(state_dict, config) + + # 3. hard coded mapping + state_dict = update_hard_coded_keys(state_dict) + + + for key in list(state_dict.keys()): + + # 4. final depth estimation head + if key.startswith("head."): + new_key = "head." + key + + # 5. fov model head + elif key.startswith("fov.head."): + new_key = key.replace("fov", 'fov_model') + + # 6. projections between encoder and fusion + elif "decoder.convs." in key: + n = re.findall(r'\d+', key)[0] # find digit inside string + n = n_fusion_layers - int(n) - 1 + new_key = f"projections.{n}.weight" + + # 7. fuse low res with image features + elif "encoder.fuse_lowres." in key: + new_key = key.replace("encoder.fuse_lowres", "depth_pro.encoder.fuse_image_with_low_res") + + # 8. fusion stage (decoder) + elif key.startswith("decoder.fusions."): + new_key = key.replace("decoder.fusions.", "fusion_stage.layers.") + new_key = new_key.replace("resnet1", "residual_layer1") + new_key = new_key.replace("resnet2", "residual_layer2") + new_key = new_key.replace("residual.1", "convolution1") + new_key = new_key.replace("residual.3", "convolution2") + new_key = new_key.replace("out_conv", "projection") + + n_with_dots = re.findall(r'.\d+.', new_key)[0] # find digit inside string followed by . + n = n_with_dots[1:-1] + n = n_fusion_layers - int(n) - 1 + new_key = new_key.replace(n_with_dots, f".{n}.") + + else: + continue + + state_dict[new_key] = state_dict.pop(key) + + model = DepthProForDepthEstimation(config, use_fov_model=True).eval() + model.load_state_dict(state_dict) + + exit() + + # ---------------- + + + + for key, val in state_dict.copy().items(): + val = state_dict.pop(key) + if "w12" in key: + key = key.replace("w12", "weights_in") + if "w3" in key: + key = key.replace("w3", "weights_out") + state_dict[key] = val + + # load HuggingFace model + if image_classifier: + model = Dinov2ForImageClassification(config).eval() + model.dinov2.load_state_dict(state_dict) + model_name_to_classifier_dict_url = { + "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", + "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", + "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", + "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", + } + url = model_name_to_classifier_dict_url[model_name] + classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) + model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) + else: + model = Dinov2Model(config).eval() + model.load_state_dict(state_dict) + + # load image + image = prepare_img() + + # preprocess image + transformations = transforms.Compose( + [ + transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + 
mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values + std=IMAGENET_DEFAULT_STD, # across a large photo dataset. + ), + ] + ) + + original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + + processor = BitImageProcessor( + size={"shortest_edge": 256}, + resample=PILImageResampling.BICUBIC, + image_mean=IMAGENET_DEFAULT_MEAN, + image_std=IMAGENET_DEFAULT_STD, + ) + pixel_values = processor(image, return_tensors="pt").pixel_values + + assert torch.allclose(original_pixel_values, pixel_values) + + with torch.no_grad(): + outputs = model(pixel_values, output_hidden_states=True) + original_outputs = original_model(pixel_values) + + # assert values + if image_classifier: + print("Predicted class:") + class_idx = outputs.logits.argmax(-1).item() + print(model.config.id2label[class_idx]) + else: + assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape + assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + model_name_to_hf_name = { + "dinov2_vits14": "dinov2-small", + "dinov2_vitb14": "dinov2-base", + "dinov2_vitl14": "dinov2-large", + "dinov2_vitg14": "dinov2-giant", + "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", + "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", + "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", + "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", + } + + name = model_name_to_hf_name[model_name] + model.push_to_hub(f"facebook/{name}") + processor.push_to_hub(f"facebook/{name}") + + +convert_depth_pro_checkpoint("apple/DepthPro", "depth_pro.pt", "yooo_torch_dump", False) +exit() +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." + ) + parser.add_argument( + "--filename", default="depth_pro.pt", type=str, help="Name of the file from repo you'd like to convert." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_depth_pro_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) From 7d323ce91f071cc5ed6b0c36f407866e545dbe65 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:41:13 +0500 Subject: [PATCH 018/151] fix fov squeeze --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 0ac35b582d7f..eb8bf02f83d1 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1294,7 +1294,7 @@ def forward( last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) - fov_output = fov_output.reshape(1) + fov_output = fov_output.reshape(B) if not return_dict: head_outputs = (fov_output,) From 6aaa59e943c5d5fd5c301404aaa47e8db1402355 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:42:18 +0500 Subject: [PATCH 019/151] update conversion script (without test) --- .../depth_pro/convert_depth_pro_to_hf.py | 160 +++++++----------- 1 file changed, 59 insertions(+), 101 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py index 38b7a7853d76..de7bf395a355 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -83,6 +83,7 @@ def create_vit_rename_keys(config): # fmt: on return rename_keys + # we split up the matrix of each encoder layer into queries, keys and values def read_in_q_k_v(state_dict, config): state_dict_keys = state_dict.keys() @@ -102,6 +103,7 @@ def read_in_q_k_v(state_dict, config): state_dict[key.replace("attn.qkv", "attention.attention.value")] = v return state_dict + # hard coded upsample keys def update_hard_coded_keys(state_dict): mapping = [ @@ -134,13 +136,24 @@ def update_hard_coded_keys(state_dict): return state_dict - # We will verify our results on an image of cute cats -def prepare_img(): +def inference_test(processor, model): url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image + inputs = processor(image) + with torch.no_grad(): + outputs = model(**inputs) + + predicted_depth = outputs.predicted_depth + fov = outputs.fov + + predicted_depth, fov = processor.post_process_depth_estimation(predicted_depth, fov) + + print("predicted_depth.shape:", predicted_depth.shape) + print("fov.shape:", fov.shape) + print("fov:", fov) + print("Inference was Successfull!") @torch.no_grad() @@ -150,12 +163,10 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu """ # define default DepthPro configuration - config = DepthProConfig() + config = DepthProConfig(use_fov_model=True) # load original weights from huggingface hub - # TODO: download from hub - # file_path = hf_hub_download(repo_id, filename) - file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" + file_path = hf_hub_download(repo_id, filename) state_dict = torch.load(file_path, weights_only=True) # enumerate fusion layers @@ -224,108 +235,50 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu model = DepthProForDepthEstimation(config, use_fov_model=True).eval() model.load_state_dict(state_dict) - exit() - - # 
---------------- + # TODO + processor = ... + # inference_test(processor, model) - + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + # TODO + # print(f"Saving image processor to {pytorch_dump_folder_path}") + # processor.save_pretrained(pytorch_dump_folder_path) - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2ForImageClassification(config).eval() - model.dinov2.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", - "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", - "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", - "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2Model(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + # TODO + # if push_to_hub: + # model.push_to_hub("...") + # processor.push_to_hub("...") - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - assert torch.allclose(original_pixel_values, pixel_values) +""" +- create files locally using function +```py +convert_depth_pro_checkpoint( + "apple/DepthPro", + "depth_pro.pt", + "my_local_dump", + False, +) +``` + +- create files locally using command line args +```cmd +python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ + --repo_id "apple/DepthPro" \ + --filename "depth_pro.pt" \ + --pytorch_dump_folder_path "my_local_dump" \ + --push_to_hub 0 +``` +""" - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14": "dinov2-small", - "dinov2_vitb14": "dinov2-base", - "dinov2_vitl14": "dinov2-large", - "dinov2_vitg14": "dinov2-giant", - "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", - "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", - "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", - "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"facebook/{name}") - processor.push_to_hub(f"facebook/{name}") - - -convert_depth_pro_checkpoint("apple/DepthPro", "depth_pro.pt", "yooo_torch_dump", False) -exit() if __name__ == "__main__": parser = argparse.ArgumentParser() + # Required parameters parser.add_argument( "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." 
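A hedged smoke test one could run after the converter has written `my_local_dump` (the example folder name from the usage notes above). The image processor half is still a TODO at this point in the series, so only the model is loaded here:

```py
from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation

# assumes the conversion script above has already saved weights to "my_local_dump"
model = DepthProForDepthEstimation.from_pretrained("my_local_dump").eval()
print(model.config.use_fov_model)                                    # True for this conversion
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
```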
@@ -341,4 +294,9 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu ) args = parser.parse_args() - convert_depth_pro_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) + convert_depth_pro_checkpoint( + args.repo_id, + args.filename, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) From 263b773db7ac897a6a610e15a3fc5be0b79615da Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:47:17 +0500 Subject: [PATCH 020/151] upload ruff image processing --- .../depth_pro/image_processing_depth_pro.py | 397 ++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100644 src/transformers/models/depth_pro/image_processing_depth_pro.py diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py new file mode 100644 index 000000000000..883c50ebfe6f --- /dev/null +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -0,0 +1,397 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for DepthPro.""" + +from typing import Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np +from icecream import ic + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, logging + +import math +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import pad, resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_torch_available, + is_torch_tensor, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_vision_available, + logging, + requires_backends, +) + +from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimatorOutput + + +if is_torch_available(): + import torch + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class DepthProImageProcessor(BaseImageProcessor): + r""" + Constructs a DepthPro image processor. 
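A short usage sketch for the processor added in this commit, assuming this branch is installed; the COCO test image is the same one used elsewhere in the series:

```py
import requests
from PIL import Image
from transformers.models.depth_pro.image_processing_depth_pro import DepthProImageProcessor

processor = DepthProImageProcessor()    # defaults: resize to 1536x1536, rescale by 1/255, normalize
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)     # torch.Size([1, 3, 1536, 1536])
```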
+ + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. 
+ data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + + # ic(image.dtype) + # ic(type(image)) + # ic(image.shape) + # ic(image.mean()) + # ic(image.std()) + # ic(image.min()) + # ic(image.max()) + # ic(output_size) + # ic(resample) + # ic(data_format) + # ic(input_data_format) + # # exit() + + # return torch.nn.functional.interpolate( + # input=torch.from_numpy(image), + # size=output_size, + # mode=resample, + # align_corners=True, + # ) + + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. 
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + # TODO + # depth-pro image preprocessing scales the image before resizing it + + if do_resize: + images = [ + self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + predicted_depth, + fov=None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + self.size = { + 'width': 3024, + 'height': 2268, + } + W = self.size['width'] + H = self.size['height'] + + if (fov is not None) and (len(predicted_depth) != len(fov)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + output_depths = [] + output_fovs = None if fov is None else [] + fov = [None] * len(predicted_depth) if fov is None else fov + for depth, fov_value in zip(predicted_depth, fov): + + if fov_value is not None: + fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * W / fov_value + + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False + ).squeeze() + + if fov_value is not None: + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + output_fovs.append(fov_value) + + output_depths.append(depth) + + return output_depths, output_fovs From 17e5487ce6782998aaccb8a8799b9495d7d545bd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 09:35:52 +0500 Subject: [PATCH 021/151] create fast image processing --- .../image_processing_depth_pro_fast.py | 362 ++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 src/transformers/models/depth_pro/image_processing_depth_pro_fast.py diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py new file mode 100644 index 000000000000..8860f2e86830 --- /dev/null +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -0,0 +1,362 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for DepthPro.""" + +import functools +from typing import Dict, List, Optional, Union + +from ...image_processing_base import BatchFeature +from ...image_processing_utils import get_size_dict +from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + ImageType, + PILImageResampling, + get_image_type, + make_list_of_images, + pil_torch_interpolation_mapping, +) +from ...utils import TensorType, logging, requires_backends +from ...utils.import_utils import is_torch_available, is_torchvision_available + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +if is_torchvision_available(): + from torchvision.transforms import Compose, Normalize, PILToTensor, Resize + + +class DepthProImageProcessorFast(BaseImageProcessorFast): + r""" + Constructs a DepthPro image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
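+
+    Example (a minimal usage sketch; it assumes `torch` and `torchvision` are installed and imports the
+    class from its module path, since a top-level `transformers` export may not be wired up yet):
+
+    ```python
+    >>> from PIL import Image
+    >>> from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast
+
+    >>> image = Image.new("RGB", (640, 480))
+    >>> processor = DepthProImageProcessorFast()
+    >>> inputs = processor(images=image, return_tensors="pt")
+    >>> list(inputs["pixel_values"].shape)
+    [1, 3, 1536, 1536]
+    ```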
+ """ + + model_input_names = ["pixel_values"] + _transform_params = [ + "do_resize", + "do_rescale", + "do_normalize", + "size", + "resample", + "antialias", + "rescale_factor", + "image_mean", + "image_std", + "image_type", + ] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.antialias = antialias + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def _build_transforms( + self, + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + image_type: ImageType, + ) -> "Compose": + """ + Given the input settings build the image transforms using `torchvision.transforms.Compose`. + """ + transforms = [] + + # All PIL and numpy values need to be converted to a torch tensor + # to keep cross compatibility with slow image processors + if image_type == ImageType.PIL: + transforms.append(PILToTensor()) + + elif image_type == ImageType.NUMPY: + transforms.append(NumpyToTensor()) + + # We can combine rescale and normalize into a single operation for speed + if do_rescale and do_normalize: + transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor)) + elif do_rescale: + transforms.append(Rescale(rescale_factor=rescale_factor)) + elif do_normalize: + transforms.append(Normalize(image_mean, image_std)) + + # depth-pro scales the image before resizing it + if do_resize: + transforms.append( + Resize( + (size["height"], size["width"]), + interpolation=pil_torch_interpolation_mapping[resample], + antialias=antialias + ) + ) + + return Compose(transforms) + + @functools.lru_cache(maxsize=1) + def _validate_input_arguments( + self, + return_tensors: Union[str, TensorType], + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + data_format: Union[str, ChannelDimension], + image_type: ImageType, + ): + if return_tensors != "pt": + raise ValueError("Only returning PyTorch tensors is currently supported.") + + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be 
specified if do_normalize is True.") + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + antialias: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Only "pt" is supported + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. The following formats are currently supported: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
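+
+        Example (a brief sketch of per-call overrides; `processor` is assumed to be an instantiated
+        `DepthProImageProcessorFast` and `image` a PIL image):
+
+        ```python
+        >>> inputs = processor(images=image, size={"height": 768, "width": 768}, do_normalize=False)
+        >>> list(inputs["pixel_values"].shape)
+        [1, 3, 768, 768]
+        ```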
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + size = size if size is not None else self.size + # Make hashable for cache + size = SizeDict(**size) + image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean + image_std = tuple(image_std) if isinstance(image_std, list) else image_std + + images = make_list_of_images(images) + image_type = get_image_type(images[0]) + + if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]: + raise ValueError(f"Unsupported input image type {image_type}") + + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + antialias=antialias, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + return_tensors=return_tensors, + data_format=data_format, + image_type=image_type, + ) + + transforms = self.get_transforms( + do_resize=do_resize, + do_rescale=do_rescale, + do_normalize=do_normalize, + size=size, + resample=resample, + antialias=antialias, + rescale_factor=rescale_factor, + image_mean=image_mean, + image_std=image_std, + image_type=image_type, + ) + transformed_images = [transforms(image) for image in images] + + data = {"pixel_values": torch.stack(transformed_images, dim=0)} + return BatchFeature(data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + predicted_depth, + fov=None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. 
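+
+        Example (a minimal sketch of the expected inputs; `processor` is assumed to be an instantiated
+        `DepthProImageProcessorFast`, the tensors below are random stand-ins for real model outputs, and
+        the output resolution reflects the hard-coded `self.size` used at this stage):
+
+        ```python
+        >>> import torch
+
+        >>> predicted_depth = torch.rand(1, 384, 384)  # raw depth maps from the model
+        >>> fov = torch.tensor([60.0])  # estimated field of view per image, in degrees
+        >>> depths, fovs = processor.post_process_depth_estimation(predicted_depth, fov)
+        >>> list(depths[0].shape)
+        [2268, 3024]
+        ```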
+ """ + requires_backends(self, "torch") + + self.size = { + 'width': 3024, + 'height': 2268, + } + W = self.size['width'] + H = self.size['height'] + + if (fov is not None) and (len(predicted_depth) != len(fov)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + output_depths = [] + output_fovs = None if fov is None else [] + fov = [None] * len(predicted_depth) if fov is None else fov + for depth, fov_value in zip(predicted_depth, fov): + + if fov_value is not None: + fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * W / fov_value + + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False + ).squeeze() + + if fov_value is not None: + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + output_fovs.append(fov_value) + + output_depths.append(depth) + + return output_depths, output_fovs From a8dd7049a5e2683a06f8d8df4cb7d22673d35b4b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 10:42:36 +0500 Subject: [PATCH 022/151] use torch interpolation for image processing --- .../depth_pro/image_processing_depth_pro.py | 112 +++++++++++------- 1 file changed, 66 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 883c50ebfe6f..d8b9ff493b1a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,6 +14,7 @@ # limitations under the License. """Image processor class for DepthPro.""" +import functools from typing import Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union @@ -33,7 +34,7 @@ make_list_of_images, to_numpy_array, valid_images, - validate_preprocess_arguments, + pil_torch_interpolation_mapping, ) from ...utils import TensorType, filter_out_non_signature_kwargs, logging @@ -62,7 +63,6 @@ make_list_of_images, to_numpy_array, valid_images, - validate_preprocess_arguments, ) from ...utils import ( TensorType, @@ -99,6 +99,9 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. 
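The remaining hunks of this patch replace the PIL/numpy `resize` helper with a single batched call to
`torch.nn.functional.interpolate` in channels-first layout, gated by the new `antialias` flag. A
standalone sketch of that pattern, using random data and assuming a PyTorch version recent enough to
accept `antialias` for bilinear resizing:

    import numpy as np
    import torch

    # a small batch of channels-first float images
    images = [np.random.rand(3, 768, 1024).astype("float32") for _ in range(2)]
    batch = torch.from_numpy(np.stack(images))  # (batch, channels, height, width)
    resized = torch.nn.functional.interpolate(
        batch, size=(1536, 1536), mode="bilinear", antialias=False
    )
    print(resized.shape)  # torch.Size([2, 3, 1536, 1536])
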
@@ -123,6 +126,7 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -138,15 +142,17 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample + self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD def resize( self, - image: np.ndarray, + images: List[np.ndarray], size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -155,12 +161,15 @@ def resize( Resize an image to `(size["height"], size["width"])`. Args: - image (`np.ndarray`): - Image to resize. + images (`List[np.ndarray]`): + Images to resize. size (`Dict[str, int]`): Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -175,41 +184,49 @@ def resize( - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. Returns: - `np.ndarray`: The resized image. + `np.ndarray`: The resized images. """ size = get_size_dict(size) if "height" not in size or "width" not in size: raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - # ic(image.dtype) - # ic(type(image)) - # ic(image.shape) - # ic(image.mean()) - # ic(image.std()) - # ic(image.min()) - # ic(image.max()) - # ic(output_size) - # ic(resample) - # ic(data_format) - # ic(input_data_format) - # # exit() - - # return torch.nn.functional.interpolate( - # input=torch.from_numpy(image), - # size=output_size, - # mode=resample, - # align_corners=True, - # ) - - return resize( - image, + images = np.stack(images) + images = torch.from_numpy(images) + + return torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=images, size=output_size, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) + # mode=pil_torch_interpolation_mapping[resample], + mode="bilinear", + antialias=antialias, + ).numpy() + + def _validate_input_arguments( + self, + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + data_format: Union[str, ChannelDimension], + ): + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") @filter_out_non_signature_kwargs() def preprocess( @@ -218,6 +235,7 @@ def preprocess( do_resize: Optional[bool] = None, size: Dict[str, int] = None, resample: PILImageResampling = None, + antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -242,6 +260,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -275,6 +296,7 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -289,15 +311,17 @@ def preprocess( "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "torch.Tensor, tf.Tensor or jax.ndarray." 
) - validate_preprocess_arguments( + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, - do_resize=do_resize, - size=size, - resample=resample, + data_format=data_format, ) # All transformations expect numpy arrays. @@ -313,15 +337,6 @@ def preprocess( # We assume that all images have the same channel dimension format. input_data_format = infer_channel_dimension_format(images[0]) - # TODO - # depth-pro image preprocessing scales the image before resizing it - - if do_resize: - images = [ - self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) - for image in images - ] - if do_rescale: images = [ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) @@ -338,6 +353,11 @@ def preprocess( to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] + # depth-pro scales the image before resizing it + # uses torch interpolation which requires ChannelDimension.FIRST + if do_resize: + images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) + data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) From 261bbafe4fb65d3bfe344045d92c7ca67f05283f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 12:12:39 +0500 Subject: [PATCH 023/151] complete post_process_depth_estimation --- .../depth_pro/image_processing_depth_pro.py | 71 +++++++++++-------- .../image_processing_depth_pro_fast.py | 70 ++++++++++-------- 2 files changed, 83 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index d8b9ff493b1a..0a7313e2d19a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,13 +14,13 @@ # limitations under the License. """Image processor class for DepthPro.""" -import functools from typing import Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union import numpy as np from icecream import ic + from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format from ...image_utils import ( @@ -186,6 +186,8 @@ def resize( Returns: `np.ndarray`: The resized images. """ + requires_backends(self, "torch") + size = get_size_dict(size) if "height" not in size or "width" not in size: raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") @@ -198,10 +200,9 @@ def resize( # input should be (B, C, H, W) input=images, size=output_size, - # mode=pil_torch_interpolation_mapping[resample], - mode="bilinear", + mode=pil_torch_interpolation_mapping[resample].value, antialias=antialias, - ).numpy() + ) def _validate_input_arguments( self, @@ -357,14 +358,16 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) + images = images.numpy() data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) def post_process_depth_estimation( self, - predicted_depth, - fov=None, + predicted_depths, + fovs=None, + target_sizes=None, ) -> List[Dict[str, TensorType]]: """ Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. @@ -383,35 +386,45 @@ def post_process_depth_estimation( """ requires_backends(self, "torch") - self.size = { - 'width': 3024, - 'height': 2268, - } - W = self.size['width'] - H = self.size['height'] - - if (fov is not None) and (len(predicted_depth) != len(fov)): + if (fovs is not None) and (len(predicted_depths) != len(fovs)): raise ValueError( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) + if (target_sizes is not None) and (len(predicted_depths) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + outputs = { + "predicted_depth": [], + "fov": [] if fovs is not None else None + } + + fovs = [None] * len(predicted_depths) if fovs is None else fovs + target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes + + for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - output_depths = [] - output_fovs = None if fov is None else [] - fov = [None] * len(predicted_depth) if fov is None else fov - for depth, fov_value in zip(predicted_depth, fov): + if target_size is not None: - if fov_value is not None: - fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * W / fov_value + # scale image w.r.t fov + if fov is not None: + width = target_size[1] + fov = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov)) + predicted_depth = predicted_depth * width / fov + outputs["fov"].append(fov) - depth = torch.nn.functional.interpolate( - depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False - ).squeeze() + # interpolate + predicted_depth = self.resize( + predicted_depth.unsqueeze(0).unsqueeze(1), + size=target_size, + resample=self.resample, + antialias=self.antialias + ).squeeze() - if fov_value is not None: - depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) - output_fovs.append(fov_value) + # inverse the depth + predicted_depth = 1.0 / torch.clamp(predicted_depth, min=1e-4, max=1e4) - output_depths.append(depth) + outputs["predicted_depth"].append(predicted_depth) - return output_depths, output_fovs + return outputs diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 8860f2e86830..38d699452e44 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -154,7 +154,7 @@ def _build_transforms( elif do_normalize: transforms.append(Normalize(image_mean, image_std)) 
- # depth-pro scales the image before resizing it + # depth-pro scales the image before resizing it if do_resize: transforms.append( Resize( @@ -229,9 +229,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -308,8 +308,9 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depth, - fov=None, + predicted_depths, + fovs=None, + target_sizes=None, ) -> List[Dict[str, TensorType]]: """ Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. @@ -328,35 +329,46 @@ def post_process_depth_estimation( """ requires_backends(self, "torch") - self.size = { - 'width': 3024, - 'height': 2268, - } - W = self.size['width'] - H = self.size['height'] - - if (fov is not None) and (len(predicted_depth) != len(fov)): + if (fovs is not None) and (len(predicted_depths) != len(fovs)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + if (target_sizes is not None) and (len(predicted_depths) != len(target_sizes)): raise ValueError( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - output_depths = [] - output_fovs = None if fov is None else [] - fov = [None] * len(predicted_depth) if fov is None else fov - for depth, fov_value in zip(predicted_depth, fov): + outputs = { + "predicted_depth": [], + "fov": [] if fovs is not None else None + } + + fovs = [None] * len(predicted_depths) if fovs is None else fovs + target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes + + for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): + + if target_size is not None: - if fov_value is not None: - fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * W / fov_value + # scale image w.r.t fov + if fov is not None: + width = target_size[1] + fov = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov)) + predicted_depth = predicted_depth * width / fov + outputs["fov"].append(fov) - depth = torch.nn.functional.interpolate( - depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False - ).squeeze() + # interpolate + predicted_depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=predicted_depth.unsqueeze(0).unsqueeze(1), + size=target_size, + mode=pil_torch_interpolation_mapping[self.resample].value, + antialias=self.antialias, + ).squeeze() - if fov_value is not None: - depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) - output_fovs.append(fov_value) + # inverse the depth + predicted_depth = 1.0 / torch.clamp(predicted_depth, min=1e-4, max=1e4) - output_depths.append(depth) + outputs["predicted_depth"].append(predicted_depth) - 
return output_depths, output_fovs + return outputs From a4b3556c5f7ef738048df1b7de22dfa45c822b43 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 16:36:19 +0500 Subject: [PATCH 024/151] config: fix imports and sort args --- .../depth_pro/configuration_depth_pro.py | 49 +++++++++---------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index f124d3e5b71a..fae3e84432be 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -14,15 +14,8 @@ # limitations under the License. """DepthPro model configuration""" -from collections import OrderedDict -from typing import Mapping - -from packaging import version - -from transformers.configuration_utils import PretrainedConfig -from transformers.onnx import OnnxConfig -from transformers.utils import logging -from transformers.utils.backbone_utils import get_aligned_output_features_output_indices +from ...configuration_utils import PretrainedConfig +from ...utils import logging logger = logging.get_logger(__name__) @@ -41,6 +34,8 @@ class DepthProConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. + fusion_hidden_size + TODO num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): @@ -65,6 +60,8 @@ class DepthProConfig(PretrainedConfig): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. + patch_embeddings_size + TODO qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): @@ -73,22 +70,28 @@ class DepthProConfig(PretrainedConfig): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): Whether to use the SwiGLU feedforward neural network. - out_features (`List[str]`, *optional*): - If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. - (depending on how many stages the model has). If unset and `out_indices` is set, will default to the - corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. - out_indices (`List[int]`, *optional*): - If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how - many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. - If unset and `out_features` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. apply_layernorm (`bool`, *optional*, defaults to `True`): Whether to apply layer normalization to the feature maps in case the model is used as backbone. reshape_hidden_states (`bool`, *optional*, defaults to `True`): Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. 
+ intermediate_hook_ids + TODO + intermediate_feature_dims + TODO + scaled_images_ratios + TODO + scaled_images_overlap_ratios + TODO + scaled_images_feature_dims + TODO + use_batch_norm_in_fusion + TODO + use_fov_model + TODO + num_fov_head_layers + TODO Example: @@ -127,8 +130,6 @@ def __init__( layerscale_value=1.0, drop_path_rate=0.0, use_swiglu_ffn=False, - out_features=None, - out_indices=None, apply_layernorm=True, reshape_hidden_states=True, intermediate_hook_ids = [11, 5], @@ -137,7 +138,7 @@ def __init__( scaled_images_overlap_ratios = [0.0, 0.5, 0.25], scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_fusion=False, - use_fov_model=False, + use_fov_model=True, num_fov_head_layers=2, **kwargs, ): @@ -161,10 +162,6 @@ def __init__( self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate self.use_swiglu_ffn = use_swiglu_ffn - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=out_features, out_indices=out_indices, stage_names=self.stage_names - ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_fusion = use_batch_norm_in_fusion From f13c63208caec6b70a9d8660a42d92ec4c18af3a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 16:51:12 +0500 Subject: [PATCH 025/151] apply inference in weight conversion --- .../depth_pro/convert_depth_pro_to_hf.py | 63 ++++++++++++------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py index de7bf395a355..7b4552c508ff 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -18,24 +18,22 @@ """ import argparse -import json from pathlib import Path import re import requests import torch -import torch.nn as nn from huggingface_hub import hf_hub_download from PIL import Image -from torchvision import transforms -from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.image_utils import PILImageResampling from transformers.utils import logging +# from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation # TODO: import directly from transformers from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation +from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast logging.set_verbosity_info() @@ -147,13 +145,21 @@ def inference_test(processor, model): predicted_depth = outputs.predicted_depth fov = outputs.fov + target_sizes = [[image.height, image.width]] * len(predicted_depth) - predicted_depth, fov = processor.post_process_depth_estimation(predicted_depth, fov) + outputs = processor.post_process_depth_estimation( + predicted_depths=predicted_depth, + fovs=fov, + target_sizes=target_sizes, + ) + predicted_depth = outputs['predicted_depth'] + fov = outputs['fov'] - print("predicted_depth.shape:", predicted_depth.shape) - print("fov.shape:", fov.shape) + print("\nInference ...") + print("predicted_depth:", predicted_depth) + 
print("predicted_depth[0].shape:", predicted_depth[0].shape) print("fov:", fov) - print("Inference was Successfull!") + print("Inference was Successfull!\n") @torch.no_grad() @@ -167,6 +173,7 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu # load original weights from huggingface hub file_path = hf_hub_download(repo_id, filename) + # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" state_dict = torch.load(file_path, weights_only=True) # enumerate fusion layers @@ -235,23 +242,31 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu model = DepthProForDepthEstimation(config, use_fov_model=True).eval() model.load_state_dict(state_dict) - # TODO - processor = ... - # inference_test(processor, model) + processor = DepthProImageProcessorFast( + do_resize = True, + size = {"height": 1536, "width": 1536}, + resample = PILImageResampling.BILINEAR, + antialias = False, + do_rescale = True, + rescale_factor = 1 / 255, + do_normalize = True, + image_mean = 0.5, + image_std = 0.5, + return_tensors = "pt", + ) + inference_test(processor, model) if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - # TODO - # print(f"Saving image processor to {pytorch_dump_folder_path}") - # processor.save_pretrained(pytorch_dump_folder_path) - + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) - # TODO - # if push_to_hub: - # model.push_to_hub("...") - # processor.push_to_hub("...") + if push_to_hub: + hub_path = "geetu040/DepthPro" + model.push_to_hub(hub_path) + processor.push_to_hub(hub_path) """ @@ -260,8 +275,8 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu convert_depth_pro_checkpoint( "apple/DepthPro", "depth_pro.pt", - "my_local_dump", - False, + "my_local_depth_pro_dump", + True, ) ``` @@ -270,8 +285,8 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ --repo_id "apple/DepthPro" \ --filename "depth_pro.pt" \ - --pytorch_dump_folder_path "my_local_dump" \ - --push_to_hub 0 + --pytorch_dump_folder_path "my_local_depth_pro_dump" \ + --push_to_hub ``` """ From 387ddd8c7e50f419d1abcd5a61cd48ea23e0d626 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 10:55:18 +0500 Subject: [PATCH 026/151] use mllama script instead for weight conversion --- .../depth_pro/convert_depth_pro_to_hf.py | 317 ------------------ .../convert_depth_pro_weights_to_hf.py | 255 ++++++++++++++ 2 files changed, 255 insertions(+), 317 deletions(-) delete mode 100644 src/transformers/models/depth_pro/convert_depth_pro_to_hf.py create mode 100644 src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py deleted file mode 100644 index 7b4552c508ff..000000000000 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ /dev/null @@ -1,317 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DepthPro checkpoints from the original repository. - -URL: https://huggingface.co/apple/DepthPro/tree/main -""" - -import argparse -from pathlib import Path -import re - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - -# from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation -# TODO: import directly from transformers -from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig -from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation -from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def create_vit_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def 
read_in_q_k_v(state_dict, config): - state_dict_keys = state_dict.keys() - for key in list(state_dict_keys): - if "qkv" in key: - in_proj = state_dict.pop(key) - q, k, v = torch.split(in_proj, config.hidden_size, dim=0) - - if "fov" in key: - key = key.replace('fov.encoder.0', 'fov_model.encoder') - else: - key = "depth_pro." + key - - key = key.replace("blocks", "encoder.layer") - state_dict[key.replace("attn.qkv", "attention.attention.query")] = q - state_dict[key.replace("attn.qkv", "attention.attention.key")] = k - state_dict[key.replace("attn.qkv", "attention.attention.value")] = v - return state_dict - - -# hard coded upsample keys -def update_hard_coded_keys(state_dict): - mapping = [ - # upsamples - ('encoder.upsample_latent0.0.weight', 'depth_pro.encoder.upsample_intermediate.1.proj.weight'), - ('encoder.upsample_latent0.1.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight'), - ('encoder.upsample_latent0.2.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight'), - ('encoder.upsample_latent0.3.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight'), - ('encoder.upsample_latent1.0.weight', 'depth_pro.encoder.upsample_intermediate.0.proj.weight'), - ('encoder.upsample_latent1.1.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight'), - ('encoder.upsample_latent1.2.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight'), - ('encoder.upsample0.0.weight', 'depth_pro.encoder.upsample_scaled_images.2.proj.weight'), - ('encoder.upsample0.1.weight', 'depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight'), - ('encoder.upsample1.0.weight', 'depth_pro.encoder.upsample_scaled_images.1.proj.weight'), - ('encoder.upsample1.1.weight', 'depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight'), - ('encoder.upsample2.0.weight', 'depth_pro.encoder.upsample_scaled_images.0.proj.weight'), - ('encoder.upsample2.1.weight', 'depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight'), - ('encoder.upsample_lowres.weight', 'depth_pro.encoder.upsample_image.upsample_blocks.0.weight'), - ('encoder.upsample_lowres.bias', 'depth_pro.encoder.upsample_image.upsample_blocks.0.bias'), - - # neck - ("fov.downsample.0.weight", "fov_model.global_neck.0.weight"), - ("fov.downsample.0.bias", "fov_model.global_neck.0.bias"), - ("fov.encoder.1.weight", "fov_model.encoder_neck.weight"), - ("fov.encoder.1.bias", "fov_model.encoder_neck.bias"), - ] - for src, dest in mapping: - state_dict[dest] = state_dict.pop(src) - - return state_dict - - -# We will verify our results on an image of cute cats -def inference_test(processor, model): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - inputs = processor(image) - with torch.no_grad(): - outputs = model(**inputs) - - predicted_depth = outputs.predicted_depth - fov = outputs.fov - target_sizes = [[image.height, image.width]] * len(predicted_depth) - - outputs = processor.post_process_depth_estimation( - predicted_depths=predicted_depth, - fovs=fov, - target_sizes=target_sizes, - ) - predicted_depth = outputs['predicted_depth'] - fov = outputs['fov'] - - print("\nInference ...") - print("predicted_depth:", predicted_depth) - print("predicted_depth[0].shape:", predicted_depth[0].shape) - print("fov:", fov) - print("Inference was Successfull!\n") - - -@torch.no_grad() -def convert_depth_pro_checkpoint(repo_id, filename, 
pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DepthPro structure. - """ - - # define default DepthPro configuration - config = DepthProConfig(use_fov_model=True) - - # load original weights from huggingface hub - file_path = hf_hub_download(repo_id, filename) - # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" - state_dict = torch.load(file_path, weights_only=True) - - # enumerate fusion layers - n_scaled_images = len(config.scaled_images_ratios) # 3 - n_intermediate_hooks = len(config.intermediate_hook_ids) # 2 - n_fusion_layers = n_scaled_images + n_intermediate_hooks # 5 - - # 1. keys for vit encoders - vit_rename_keys = create_vit_rename_keys(config) - for src_prefix, dest_prefix in [ - ("encoder.patch_encoder", "depth_pro.encoder.patch_encoder"), - ("encoder.image_encoder", "depth_pro.encoder.image_encoder"), - ("fov.encoder.0", "fov_model.encoder"), - ]: - for src, dest in vit_rename_keys: - src = src_prefix + "." + src - dest = dest_prefix + "." + dest - state_dict[dest] = state_dict.pop(src) - - # 2. qkv keys for vit encoders - state_dict = read_in_q_k_v(state_dict, config) - - # 3. hard coded mapping - state_dict = update_hard_coded_keys(state_dict) - - - for key in list(state_dict.keys()): - - # 4. final depth estimation head - if key.startswith("head."): - new_key = "head." + key - - # 5. fov model head - elif key.startswith("fov.head."): - new_key = key.replace("fov", 'fov_model') - - # 6. projections between encoder and fusion - elif "decoder.convs." in key: - n = re.findall(r'\d+', key)[0] # find digit inside string - n = n_fusion_layers - int(n) - 1 - new_key = f"projections.{n}.weight" - - # 7. fuse low res with image features - elif "encoder.fuse_lowres." in key: - new_key = key.replace("encoder.fuse_lowres", "depth_pro.encoder.fuse_image_with_low_res") - - # 8. fusion stage (decoder) - elif key.startswith("decoder.fusions."): - new_key = key.replace("decoder.fusions.", "fusion_stage.layers.") - new_key = new_key.replace("resnet1", "residual_layer1") - new_key = new_key.replace("resnet2", "residual_layer2") - new_key = new_key.replace("residual.1", "convolution1") - new_key = new_key.replace("residual.3", "convolution2") - new_key = new_key.replace("out_conv", "projection") - - n_with_dots = re.findall(r'.\d+.', new_key)[0] # find digit inside string followed by . 
- n = n_with_dots[1:-1] - n = n_fusion_layers - int(n) - 1 - new_key = new_key.replace(n_with_dots, f".{n}.") - - else: - continue - - state_dict[new_key] = state_dict.pop(key) - - model = DepthProForDepthEstimation(config, use_fov_model=True).eval() - model.load_state_dict(state_dict) - - processor = DepthProImageProcessorFast( - do_resize = True, - size = {"height": 1536, "width": 1536}, - resample = PILImageResampling.BILINEAR, - antialias = False, - do_rescale = True, - rescale_factor = 1 / 255, - do_normalize = True, - image_mean = 0.5, - image_std = 0.5, - return_tensors = "pt", - ) - inference_test(processor, model) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - hub_path = "geetu040/DepthPro" - model.push_to_hub(hub_path) - processor.push_to_hub(hub_path) - - -""" -- create files locally using function -```py -convert_depth_pro_checkpoint( - "apple/DepthPro", - "depth_pro.pt", - "my_local_depth_pro_dump", - True, -) -``` - -- create files locally using command line args -```cmd -python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ - --repo_id "apple/DepthPro" \ - --filename "depth_pro.pt" \ - --pytorch_dump_folder_path "my_local_depth_pro_dump" \ - --push_to_hub -``` -""" - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required parameters - parser.add_argument( - "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." - ) - parser.add_argument( - "--filename", default="depth_pro.pt", type=str, help="Name of the file from repo you'd like to convert." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - - args = parser.parse_args() - convert_depth_pro_checkpoint( - args.repo_id, - args.filename, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py new file mode 100644 index 000000000000..fe862d7469a1 --- /dev/null +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -0,0 +1,255 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
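+"""Convert Apple's original DepthPro checkpoint into the Hugging Face DepthPro format.
+
+Original checkpoint: https://huggingface.co/apple/DepthPro
+
+`ORIGINAL_TO_CONVERTED_KEY_MAPPING` below maps original parameter names to converted ones via regex
+patterns; callable replacements recompute indices where the original stores layers in reverse order
+(decoder convolutions and fusion layers), and the `(query|key|value)` placeholder marks fused qkv
+weights that `get_qkv_state_dict` splits into three equal chunks along dim 0. A sketch of that split
+(hypothetical key and a dummy tensor, chosen only for illustration):
+
+```python
+>>> import torch
+>>> split = get_qkv_state_dict("layer.0.attention.attention.(query|key|value).weight", torch.zeros(6, 2))
+>>> sorted(split)
+['layer.0.attention.attention.key.weight', 'layer.0.attention.attention.query.weight', 'layer.0.attention.attention.value.weight']
+>>> split["layer.0.attention.attention.query.weight"].shape
+torch.Size([2, 2])
+```
+"""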
+ +import argparse +import gc +import os + +import regex as re +import torch +from huggingface_hub import hf_hub_download +from transformers.image_utils import PILImageResampling + +from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig +from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast +from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation + + +# fmt: off +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + + # patch_encoder/image_encoder (ViT based) + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", + + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + r"decoder.convs.(\d+).weight": lambda match: ( + f"projections.{4-int(match.group(1))}.weight" + ), + + # fov_model.encoder (ViT based) + r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + + # fov head + r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + + # fusion stage + r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" + ), + r"decoder.fusions.(\d+).out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.projection.{match.group(2)}" + ), + 
r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" + ), + + # qkv attentions blocks + + # upsamples (hard coded; regex is not very feasible here) + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", + "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", +} +# fmt: on + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line + continue + new_text = re.sub(pattern, replacement, new_text) + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + return output_dict + +def get_qkv_state_dict(key, parameter): + qkv_state_dict = {} + placeholder = re.search(r'(\(.*?\))', key).group(1) + replacements_keys = placeholder[1:-1].split("|") + replacements_vals = torch.split( + parameter, + split_size_or_sections=parameter.size(0)//len(replacements_keys), + dim=0 + ) + for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): + qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val + return qkv_state_dict + +def write_model( + hf_repo_id: str, + output_dir: str, + safe_serialization: bool=True, +): + os.makedirs(output_dir, exist_ok=True) + + # ------------------------------------------------------------ + # Create and save config + # ------------------------------------------------------------ + + # create config + config = DepthProConfig( + # this config is same as the default config and used for pre-trained weights + hidden_size=1024, + fusion_hidden_size=256, + num_hidden_layers=24, + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=1536, + patch_size=384, + num_channels=3, + patch_embeddings_size=16, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + 
use_swiglu_ffn=False,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        intermediate_hook_ids = [11, 5],
+        intermediate_feature_dims = [256, 256],
+        scaled_images_ratios = [0.25, 0.5, 1],
+        scaled_images_overlap_ratios = [0.0, 0.5, 0.25],
+        scaled_images_feature_dims = [1024, 1024, 512],
+        use_batch_norm_in_fusion=False,
+        use_fov_model=True,
+        num_fov_head_layers=2,
+    )
+
+    # save config
+    config.save_pretrained(output_dir)
+    print("Model config saved successfully...")
+
+    # ------------------------------------------------------------
+    # Convert weights
+    # ------------------------------------------------------------
+
+    # download and load state_dict from hf repo
+    file_path = hf_hub_download(hf_repo_id, "depth_pro.pt")
+    # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" # when you already have the files locally
+    loaded = torch.load(file_path, weights_only=True)
+
+    print("Converting model...")
+    all_keys = list(loaded.keys())
+    new_keys = convert_old_keys_to_new_keys(all_keys)
+
+    state_dict = {}
+    for key in all_keys:
+        new_key = new_keys[key]
+        current_parameter = loaded.pop(key)
+
+        if "qkv" in key:
+            qkv_state_dict = get_qkv_state_dict(new_key, current_parameter)
+            state_dict.update(qkv_state_dict)
+        else:
+            state_dict[new_key] = current_parameter
+
+    print("Loading the checkpoint in a DepthPro model.")
+    model = DepthProForDepthEstimation(config)
+    model.load_state_dict(state_dict, strict=True, assign=True)
+    print("Checkpoint loaded successfully.")
+
+    print("Saving the model.")
+    model.save_pretrained(output_dir, safe_serialization=safe_serialization)
+    del state_dict, model
+
+    # Safety check: reload the converted model
+    gc.collect()
+    print("Reloading the model to check if it's saved correctly.")
+    DepthProForDepthEstimation.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto")
+    print("Model reloaded successfully.")
+
+def write_image_processor(output_dir: str):
+    image_processor = DepthProImageProcessorFast(
+        do_resize = True,
+        size = {"height": 1536, "width": 1536},
+        resample = PILImageResampling.BILINEAR,
+        antialias = False,
+        do_rescale = True,
+        rescale_factor = 1 / 255,
+        do_normalize = True,
+        image_mean = 0.5,
+        image_std = 0.5,
+        return_tensors = "pt",
+    )
+    image_processor.save_pretrained(output_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--hf_repo_id",
+        default="apple/DepthPro",
+        help="Location of official weights from apple on HF",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default="apple_DepthPro",
+        help="Location to write HF model and processor",
+    )
+    parser.add_argument(
+        "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`."
+ ) + args = parser.parse_args() + + write_model( + hf_repo_id=args.hf_repo_id, + output_dir=args.output_dir, + safe_serialization=args.safe_serialization, + ) + + write_image_processor( + output_dir=args.output_dir, + ) + + +if __name__ == "__main__": + main() From 9b67f9d2afc1b081a4990149eb16ea906ce09295 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 11:09:50 +0500 Subject: [PATCH 027/151] clean weight conversion script --- .../convert_depth_pro_weights_to_hf.py | 106 +++++++++--------- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index fe862d7469a1..0b81e8907e29 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -29,39 +29,55 @@ # fmt: off ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # patch_encoder/image_encoder (ViT based) - r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", - r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", - r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", - r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", - r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", - r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", - r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + # encoder and head + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", + 
r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + + # fov + r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + + # upsamples (hard coded; regex is not very feasible here) + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", + "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", + + # projections between encoder and fusion r"decoder.convs.(\d+).weight": lambda match: ( f"projections.{4-int(match.group(1))}.weight" ), - # fov_model.encoder (ViT based) - r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", - r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", - r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", - r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", - r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", - 
r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", - r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", - r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", - r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", - - # fov head - r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", - r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", - r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", - # fusion stage r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" @@ -72,25 +88,6 @@ r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" ), - - # qkv attentions blocks - - # upsamples (hard coded; regex is not very feasible here) - "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", - "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", - "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", - "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", - "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", - "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", - "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", - "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", - "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", - "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", - "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", - "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", - "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", - "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", - "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", } # fmt: on @@ -108,9 +105,18 @@ def convert_old_keys_to_new_keys(state_dict_keys: dict = None): return output_dict def get_qkv_state_dict(key, parameter): + """ + new key which looks like this + xxxx.(q|k|v).xxx (m, n) + + is converted to + xxxx.q.xxxx (m//3, n) + xxxx.k.xxxx (m//3, n) + xxxx.v.xxxx (m//3, n) + """ qkv_state_dict = {} - placeholder = re.search(r'(\(.*?\))', key).group(1) - replacements_keys = placeholder[1:-1].split("|") + placeholder = re.search(r'(\(.*?\))', key).group(1) # finds "(query|key|value)" + replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] replacements_vals = torch.split( parameter, split_size_or_sections=parameter.size(0)//len(replacements_keys), From 617c872fb90d313f03fc55962088127e659241c7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 12:57:50 +0500 Subject: [PATCH 028/151] add depth-pro status in other files --- 
src/transformers/__init__.py | 16 +++++ .../models/auto/configuration_auto.py | 2 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/depth_pro/__init__.py | 72 +++++++++++++++++++ .../convert_depth_pro_weights_to_hf.py | 8 ++- .../depth_pro/image_processing_depth_pro.py | 2 - utils/check_docstrings.py | 1 + utils/check_repo.py | 1 + 9 files changed, 101 insertions(+), 5 deletions(-) create mode 100644 src/transformers/models/depth_pro/__init__.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 47b43e0b9089..3d0b85e3a1b4 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -408,6 +408,7 @@ "DPRReaderTokenizer", ], "models.dpt": ["DPTConfig"], + "models.depth_pro": ["DepthProConfig"], "models.efficientnet": ["EfficientNetConfig"], "models.electra": [ "ElectraConfig", @@ -1195,6 +1196,7 @@ _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) + _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) @@ -2136,6 +2138,13 @@ "DPTPreTrainedModel", ] ) + _import_structure["models.depth_pro"].extend( + [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + ) _import_structure["models.efficientnet"].extend( [ "EfficientNetForImageClassification", @@ -5272,6 +5281,7 @@ DPRReaderTokenizer, ) from .models.dpt import DPTConfig + from .models.depth_pro import DepthProConfig from .models.efficientnet import ( EfficientNetConfig, ) @@ -6100,6 +6110,7 @@ from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor + from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.efficientnet import EfficientNetImageProcessor from .models.flava import ( FlavaFeatureExtractor, @@ -6907,6 +6918,11 @@ DPTModel, DPTPreTrainedModel, ) + from .models.depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) from .models.efficientnet import ( EfficientNetForImageClassification, EfficientNetModel, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 48625ea3f346..d8860d38f850 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -93,6 +93,7 @@ ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), + ("depth_pro", "DepthProConfig"), ("efficientformer", "EfficientFormerConfig"), ("efficientnet", "EfficientNetConfig"), ("electra", "ElectraConfig"), @@ -394,6 +395,7 @@ ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), + ("depth_pro", "DepthPro"), ("efficientformer", "EfficientFormer"), ("efficientnet", "EfficientNet"), ("electra", "ELECTRA"), diff --git a/src/transformers/models/auto/image_processing_auto.py 
b/src/transformers/models/auto/image_processing_auto.py index a8960d80acc8..e7b53f30a7a0 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -77,6 +77,7 @@ ("dinov2", ("BitImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)), ("dpt", ("DPTImageProcessor",)), + ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor",)), ("efficientnet", ("EfficientNetImageProcessor",)), ("flava", ("FlavaImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 67c539fca664..4cc15ca4ca51 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -92,6 +92,7 @@ ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), + ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("electra", "ElectraModel"), @@ -571,6 +572,7 @@ ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dpt", "DPTModel"), + ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), @@ -866,6 +868,7 @@ # Model for depth estimation mapping ("depth_anything", "DepthAnythingForDepthEstimation"), ("dpt", "DPTForDepthEstimation"), + ("depth_pro", "DepthProForDepthEstimation"), ("glpn", "GLPNForDepthEstimation"), ("zoedepth", "ZoeDepthForDepthEstimation"), ] diff --git a/src/transformers/models/depth_pro/__init__.py b/src/transformers/models/depth_pro/__init__.py new file mode 100644 index 000000000000..1f2a6646c5c0 --- /dev/null +++ b/src/transformers/models/depth_pro/__init__.py @@ -0,0 +1,72 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
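With the auto-mapping entries added above, DepthPro becomes reachable through the generic Auto classes. A hedged usage sketch; the checkpoint id below is the staging repo used elsewhere in this series and is an assumption, not part of the patch:

```python
# Hedged sketch: resolve DepthPro through the Auto classes registered above.
from transformers import AutoConfig, AutoImageProcessor, AutoModelForDepthEstimation

checkpoint = "geetu040/DepthPro"  # staging repo; substitute the converted checkpoint path

config = AutoConfig.from_pretrained(checkpoint)                   # -> DepthProConfig
processor = AutoImageProcessor.from_pretrained(checkpoint)        # -> DepthProImageProcessor(Fast)
model = AutoModelForDepthEstimation.from_pretrained(checkpoint)   # -> DepthProForDepthEstimation
print(type(config).__name__, type(processor).__name__, type(model).__name__)
```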
+from typing import TYPE_CHECKING + +from ...file_utils import _LazyModule, is_torch_available, is_vision_available +from ...utils import OptionalDependencyNotAvailable + + +_import_structure = {"configuration_depth_pro": ["DepthProConfig"]} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_depth_pro"] = ["DepthProImageProcessor"] + _import_structure["image_processing_depth_pro_fast"] = ["DepthProImageProcessorFast"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_depth_pro"] = [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_depth_pro import DepthProConfig + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_depth_pro import DepthProImageProcessor + from .image_processing_depth_pro_fast import DepthProImageProcessorFast + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 0b81e8907e29..741016e88a3d 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -21,9 +21,11 @@ from huggingface_hub import hf_hub_download from transformers.image_utils import PILImageResampling -from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig -from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast -from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation +from transformers import ( + DepthProConfig, + DepthProImageProcessorFast, + DepthProForDepthEstimation, +) # fmt: off diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 0a7313e2d19a..99a7c26c9826 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -72,8 +72,6 @@ requires_backends, ) -from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimatorOutput - if is_torch_available(): import torch diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 0be960f4a33e..34deed0df47e 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -140,6 +140,7 @@ "DPRReaderTokenizer", "DPRReaderTokenizerFast", "DPTModel", + "DepthProModel", "Data2VecAudioConfig", "Data2VecTextConfig", "Data2VecTextModel", diff --git a/utils/check_repo.py b/utils/check_repo.py index 10be5cdcd262..2e131e879153 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -213,6 +213,7 @@ "JukeboxPrior", "SamModel", "DPTForDepthEstimation", + "DepthProForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", 
"ViltForImagesAndTextClassification", From 6e1c512b15474979ea3176e85214ccc70fcc6cd7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 13:33:25 +0500 Subject: [PATCH 029/151] fill docstring in config --- .../depth_pro/configuration_depth_pro.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index fae3e84432be..9b53288c41ed 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -34,8 +34,8 @@ class DepthProConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. - fusion_hidden_size - TODO + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): @@ -53,15 +53,17 @@ class DepthProConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - TODO: image_size / 2**n_fusion_blocks = patch_size / patch_embeddings_size - The size (resolution) of each image. + image_size (`int`, *optional*, defaults to 1536): + The size (resolution) of each image, + To generate depth of same size as image, + image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size + where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. - patch_embeddings_size - TODO + patch_embeddings_size (`int`, *optional*, defaults to 16): + kernel_size and stride for convolution in PatchEmbeddings. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): @@ -77,21 +79,21 @@ class DepthProConfig(PretrainedConfig): case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. intermediate_hook_ids - TODO + Indices of the intermediate hidden states from patch_encoder to use for fusion. intermediate_feature_dims - TODO + Hidden state during upsampling for each intermediate hidden states in intermediate_hook_ids. scaled_images_ratios - TODO + Use images of these ratios for patch_encoder. scaled_images_overlap_ratios - TODO + Overlap ratio between patches for each scaled image in scaled_image_ratios. scaled_images_feature_dims - TODO + Hidden state during upsampling for each scaled image in scaled_images_ratios. use_batch_norm_in_fusion - TODO + Whether to use batch normalization in the residual units of the fusion blocks. use_fov_model - TODO + Whether to use `DepthProFOVModel` to generate Field of View. num_fov_head_layers - TODO + No of convolution layers in head of `DepthProFOVModel`. 
Example: From 12ee607e5d319a488d7e807a75927cb86f463cec Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 18:47:53 +0500 Subject: [PATCH 030/151] formatting --- .../depth_pro/configuration_depth_pro.py | 2 +- .../convert_depth_pro_weights_to_hf.py | 28 ++++----- .../depth_pro/image_processing_depth_pro.py | 48 +++++++++------ .../image_processing_depth_pro_fast.py | 40 ++++++++----- .../models/depth_pro/modeling_depth_pro.py | 58 ++++++------------- 5 files changed, 88 insertions(+), 88 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 9b53288c41ed..8bab8227be7e 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -67,7 +67,7 @@ class DepthProConfig(PretrainedConfig): qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): - Initial value to use for layer scale. + Initial value to use for layer scale. drop_path_rate (`float`, *optional*, defaults to 0.0): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 741016e88a3d..c3b77f17f04c 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -41,7 +41,7 @@ r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", r"head.(\d+).(weight|bias)": r"head.head.\1.\2", # fov @@ -59,19 +59,19 @@ r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", # upsamples (hard coded; regex is not very feasible here) - "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", - "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", - "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", - "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", - "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", - "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", - "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", - "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", - "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", - "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", - "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", - 
"encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", - "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 99a7c26c9826..0e3c7d6455b0 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -166,8 +166,8 @@ def resize( resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -260,8 +260,8 @@ def preprocess( `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -352,7 +352,7 @@ def preprocess( to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] - # depth-pro scales the image before resizing it + # depth-pro scales the image before resizing it # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) @@ -363,24 +363,36 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depths, - fovs=None, - target_sizes=None, - ) -> List[Dict[str, TensorType]]: + predicted_depths: Union[TensorType, List[TensorType]], + fovs: Optional[Union[TensorType, List[TensorType], None]] = None, + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: """ - Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. - Only supports PyTorch. + Post-processes the raw depth predictions from the model to generate final depth predictions and optionally + resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) + and adjusts depth values accordingly. Args: - outputs ([`DepthEstimatorOutput`]): - Raw outputs of the model. - target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. + predicted_depths (`Union[TensorType, List[TensorType]]`): + Raw depth predictions output by the model. Can be a single tensor or a list of tensors, each + corresponding to an image in the batch. + fovs (`Optional[Union[TensorType, List[TensorType], None]]`, *optional*, defaults to `None`): + Field of view (FoV) values corresponding to each depth prediction. Should have the same length + as `predicted_depths` if provided. If `None`, FoV scaling is skipped. + target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. Returns: - `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + `Dict[str, List[TensorType]]`: + A dictionary containing: + - `"predicted_depth"`: A list of processed depth tensors. + - `"fov"`: A list of processed FoV values if provided, otherwise `None`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. 
""" requires_backends(self, "torch") diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 38d699452e44..3af05df3ccb8 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -15,7 +15,7 @@ """Fast Image processor class for DepthPro.""" import functools -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Tuple from ...image_processing_base import BatchFeature from ...image_processing_utils import get_size_dict @@ -308,24 +308,36 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depths, - fovs=None, - target_sizes=None, - ) -> List[Dict[str, TensorType]]: + predicted_depths: Union[TensorType, List[TensorType]], + fovs: Optional[Union[TensorType, List[TensorType], None]] = None, + target_sizes: Optional[Union[TensorType, List[tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: """ - Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. - Only supports PyTorch. + Post-processes the raw depth predictions from the model to generate final depth predictions and optionally + resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) + and adjusts depth values accordingly. Args: - outputs ([`DepthEstimatorOutput`]): - Raw outputs of the model. - target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. + predicted_depths (`Union[TensorType, List[TensorType]]`): + Raw depth predictions output by the model. Can be a single tensor or a list of tensors, each + corresponding to an image in the batch. + fovs (`Optional[Union[TensorType, List[TensorType], None]]`, *optional*, defaults to `None`): + Field of view (FoV) values corresponding to each depth prediction. Should have the same length + as `predicted_depths` if provided. If `None`, FoV scaling is skipped. + target_sizes (`Optional[Union[TensorType, List[tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. Returns: - `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + `Dict[str, List[TensorType]]`: + A dictionary containing: + - `"predicted_depth"`: A list of processed depth tensors. + - `"fov"`: A list of processed FoV values if provided, otherwise `None`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. 
""" requires_backends(self, "torch") diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index eb8bf02f83d1..b184b5985ba1 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -44,6 +44,13 @@ logger = logging.get_logger(__name__) +# General docstring +_CONFIG_FOR_DOC = "DepthProConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "geetu040/DepthPro" +_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024] + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT class DepthProViTPatchEmbeddings(nn.Module): @@ -942,7 +949,7 @@ def forward( # STEP 8: return these features in order of increasing size as what fusion expects last_hidden_state = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) - *scaled_images_features, + *scaled_images_features, # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) *intermediate_features, ] @@ -1049,14 +1056,7 @@ class PreTrainedModel self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # TODO - # @add_code_sample_docstrings( - # checkpoint=_CHECKPOINT_FOR_DOC, - # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, - # config_class=_CONFIG_FOR_DOC, - # modality="vision", - # expected_output=_EXPECTED_OUTPUT_SHAPE, - # ) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -1065,6 +1065,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: + r""" + Returns: + + Examples: + TODO + ```python + ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1399,7 +1406,7 @@ def __init__(self, config, use_fov_model=None): @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -1418,37 +1425,6 @@ def forward( Examples: TODO ```python - >>> from transformers import AutoImageProcessor, DPTForDepthEstimation - >>> import torch - >>> import numpy as np - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large") - >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") - - >>> # prepare image for the model - >>> inputs = image_processor(images=image, return_tensors="pt") - - >>> with torch.no_grad(): - ... outputs = model(**inputs) - ... predicted_depth = outputs.predicted_depth - - >>> # interpolate to original size - >>> prediction = torch.nn.functional.interpolate( - ... predicted_depth.unsqueeze(1), - ... size=image.size[::-1], - ... mode="bicubic", - ... align_corners=False, - ... 
) - - >>> # visualize the prediction - >>> output = prediction.squeeze().cpu().numpy() - >>> formatted = (output * 255 / np.max(output)).astype("uint8") - >>> depth = Image.fromarray(formatted) ```""" loss = None if labels is not None: From d0a8733f275941adb827a4f7e3850c2a28d66006 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:25:43 +0500 Subject: [PATCH 031/151] more formatting --- .../models/depth_pro/image_processing_depth_pro.py | 7 +++---- src/transformers/models/depth_pro/modeling_depth_pro.py | 7 +------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 0e3c7d6455b0..21810bfab645 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -15,14 +15,13 @@ """Image processor class for DepthPro.""" from typing import Dict, List, Optional, Union -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import numpy as np -from icecream import ic from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import resize, to_channel_dimension_format +from ...image_transforms import to_channel_dimension_format from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, @@ -39,7 +38,7 @@ from ...utils import TensorType, filter_out_non_signature_kwargs, logging import math -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index b184b5985ba1..3812f678b43f 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -14,23 +14,18 @@ # limitations under the License. 
"""PyTorch DepthPro model.""" -from icecream import ic - -import collections.abc import math -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple, Union import torch from torch import nn from dataclasses import dataclass -from ...utils import ModelOutput from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, DepthEstimatorOutput ) from ...utils import ( - add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging, From e6b385a9edf92a5c7f342935d75ae3e017fe122c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:45:20 +0500 Subject: [PATCH 032/151] formatting with ruff --- .../convert_depth_pro_weights_to_hf.py | 6 +-- .../depth_pro/image_processing_depth_pro.py | 39 ++----------------- .../image_processing_depth_pro_fast.py | 5 ++- .../models/depth_pro/modeling_depth_pro.py | 10 ++--- 4 files changed, 13 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index c3b77f17f04c..66dfff12065a 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -19,13 +19,13 @@ import regex as re import torch from huggingface_hub import hf_hub_download -from transformers.image_utils import PILImageResampling from transformers import ( DepthProConfig, - DepthProImageProcessorFast, DepthProForDepthEstimation, + DepthProImageProcessorFast, ) +from transformers.image_utils import PILImageResampling # fmt: off @@ -126,7 +126,7 @@ def get_qkv_state_dict(key, parameter): ) for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val - return qkv_state_dict + return qkv_state_dict def write_model( hf_repo_id: str, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 21810bfab645..6c9c7f94e226 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,12 +14,10 @@ # limitations under the License. 
"""Image processor class for DepthPro.""" -from typing import Dict, List, Optional, Union -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np - from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import to_channel_dimension_format from ...image_utils import ( @@ -30,43 +28,15 @@ PILImageResampling, infer_channel_dimension_format, is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, - pil_torch_interpolation_mapping, -) -from ...utils import TensorType, filter_out_non_signature_kwargs, logging - -import math -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union - - -if TYPE_CHECKING: - from ...modeling_outputs import DepthEstimatorOutput - -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import pad, resize, to_channel_dimension_format -from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - is_scaled_image, is_torch_available, - is_torch_tensor, make_list_of_images, + pil_torch_interpolation_mapping, to_numpy_array, valid_images, ) from ...utils import ( TensorType, filter_out_non_signature_kwargs, - is_vision_available, logging, requires_backends, ) @@ -75,9 +45,6 @@ if is_torch_available(): import torch -if is_vision_available(): - import PIL - logger = logging.get_logger(__name__) @@ -379,7 +346,7 @@ def post_process_depth_estimation( Field of view (FoV) values corresponding to each depth prediction. Should have the same length as `predicted_depths` if provided. If `None`, FoV scaling is skipped. target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): - Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing is performed. diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 3af05df3ccb8..46b502d7d26f 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -15,7 +15,7 @@ """Fast Image processor class for DepthPro.""" import functools -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict, List, Optional, Union from ...image_processing_base import BatchFeature from ...image_processing_utils import get_size_dict @@ -35,6 +35,7 @@ from ...utils import TensorType, logging, requires_backends from ...utils.import_utils import is_torch_available, is_torchvision_available + logger = logging.get_logger(__name__) @@ -325,7 +326,7 @@ def post_process_depth_estimation( Field of view (FoV) values corresponding to each depth prediction. Should have the same length as `predicted_depths` if provided. If `None`, FoV scaling is skipped. target_sizes (`Optional[Union[TensorType, List[tuple[int, int]], None]]`, *optional*, defaults to `None`): - Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + Target sizes to resize the depth predictions. 
Can be a tensor of shape `(batch_size, 2)` or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing is performed. diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3812f678b43f..5b521cfda9bd 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -15,16 +15,16 @@ """PyTorch DepthPro model.""" import math +from dataclasses import dataclass from typing import List, Optional, Set, Tuple, Union import torch from torch import nn -from dataclasses import dataclass from ...activations import ACT2FN -from ...modeling_outputs import ( - BaseModelOutput, DepthEstimatorOutput -) +from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -32,8 +32,6 @@ replace_return_docstrings, torch_int, ) -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from .configuration_depth_pro import DepthProConfig From 267e50fbe2288de71428776adebaea51b902751c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:46:50 +0500 Subject: [PATCH 033/151] formatting with style --- src/transformers/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3d0b85e3a1b4..0e6c48762a85 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5262,6 +5262,7 @@ XLMProphetNetConfig, ) from .models.depth_anything import DepthAnythingConfig + from .models.depth_pro import DepthProConfig from .models.detr import DetrConfig from .models.dinat import DinatConfig from .models.dinov2 import Dinov2Config @@ -5281,7 +5282,6 @@ DPRReaderTokenizer, ) from .models.dpt import DPTConfig - from .models.depth_pro import DepthProConfig from .models.efficientnet import ( EfficientNetConfig, ) @@ -6107,10 +6107,10 @@ from .models.deprecated.efficientformer import EfficientFormerImageProcessor from .models.deprecated.tvlt import TvltImageProcessor from .models.deprecated.vit_hybrid import ViTHybridImageProcessor + from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor - from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.efficientnet import EfficientNetImageProcessor from .models.flava import ( FlavaFeatureExtractor, @@ -6872,6 +6872,11 @@ DepthAnythingForDepthEstimation, DepthAnythingPreTrainedModel, ) + from .models.depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) from .models.detr import ( DetrForObjectDetection, DetrForSegmentation, @@ -6918,11 +6923,6 @@ DPTModel, DPTPreTrainedModel, ) - from .models.depth_pro import ( - DepthProForDepthEstimation, - DepthProModel, - DepthProPreTrainedModel, - ) from .models.efficientnet import ( EfficientNetForImageClassification, EfficientNetModel, From a1ec99743563ae054ae159a7d83dc76e9c09a4ab Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 28 Nov 2024 
00:48:06 +0500 Subject: [PATCH 034/151] fix copied classes --- .../depth_pro/configuration_depth_pro.py | 48 ++-- .../convert_depth_pro_weights_to_hf.py | 44 ++-- .../depth_pro/image_processing_depth_pro.py | 9 +- .../image_processing_depth_pro_fast.py | 9 +- .../models/depth_pro/modeling_depth_pro.py | 225 ++++++++++-------- 5 files changed, 174 insertions(+), 161 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 8bab8227be7e..d938f0a721f1 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -78,22 +78,22 @@ class DepthProConfig(PretrainedConfig): Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. - intermediate_hook_ids - Indices of the intermediate hidden states from patch_encoder to use for fusion. - intermediate_feature_dims - Hidden state during upsampling for each intermediate hidden states in intermediate_hook_ids. - scaled_images_ratios - Use images of these ratios for patch_encoder. - scaled_images_overlap_ratios - Overlap ratio between patches for each scaled image in scaled_image_ratios. - scaled_images_feature_dims - Hidden state during upsampling for each scaled image in scaled_images_ratios. - use_batch_norm_in_fusion - Whether to use batch normalization in the residual units of the fusion blocks. - use_fov_model - Whether to use `DepthProFOVModel` to generate Field of View. - num_fov_head_layers - No of convolution layers in head of `DepthProFOVModel`. + intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`): + Indices of the intermediate hidden states from the patch encoder to use for fusion. + intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`): + Hidden state dimensions during upsampling for each intermediate hidden state in `intermediate_hook_ids`. + scaled_images_ratios (`List[float]`, *optional*, defaults to `[0.25, 0.5, 1]`): + Ratios of scaled images to be used by the patch encoder. + scaled_images_overlap_ratios (`List[float]`, *optional*, defaults to `[0.0, 0.5, 0.25]`): + Overlap ratios between patches for each scaled image in `scaled_images_ratios`. + scaled_images_feature_dims (`List[int]`, *optional*, defaults to `[1024, 1024, 512]`): + Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_fov_model (`bool`, *optional*, defaults to `True`): + Whether to use `DepthProFOVModel` to generate the field of view. + num_fov_head_layers (`int`, *optional*, defaults to `2`): + Number of convolution layers in the head of `DepthProFOVModel`. 
Example: @@ -134,12 +134,13 @@ def __init__( use_swiglu_ffn=False, apply_layernorm=True, reshape_hidden_states=True, - intermediate_hook_ids = [11, 5], - intermediate_feature_dims = [256, 256], - scaled_images_ratios = [0.25, 0.5, 1], - scaled_images_overlap_ratios = [0.0, 0.5, 0.25], - scaled_images_feature_dims = [1024, 1024, 512], - use_batch_norm_in_fusion=False, + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, use_fov_model=True, num_fov_head_layers=2, **kwargs, @@ -166,7 +167,8 @@ def __init__( self.use_swiglu_ffn = use_swiglu_ffn self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.use_batch_norm_in_fusion = use_batch_norm_in_fusion + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + self.use_bias_in_fusion_residual = use_bias_in_fusion_residual self.use_fov_model = use_fov_model self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 66dfff12065a..377595b746ac 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -93,6 +93,7 @@ } # fmt: on + def convert_old_keys_to_new_keys(state_dict_keys: dict = None): output_dict = {} if state_dict_keys is not None: @@ -106,6 +107,7 @@ def convert_old_keys_to_new_keys(state_dict_keys: dict = None): output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) return output_dict + def get_qkv_state_dict(key, parameter): """ new key which looks like this @@ -117,21 +119,20 @@ def get_qkv_state_dict(key, parameter): xxxx.v.xxxx (m//3, n) """ qkv_state_dict = {} - placeholder = re.search(r'(\(.*?\))', key).group(1) # finds "(query|key|value)" - replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] + placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)" + replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] replacements_vals = torch.split( - parameter, - split_size_or_sections=parameter.size(0)//len(replacements_keys), - dim=0 + parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0 ) for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val return qkv_state_dict + def write_model( hf_repo_id: str, output_dir: str, - safe_serialization: bool=True, + safe_serialization: bool = True, ): os.makedirs(output_dir, exist_ok=True) @@ -162,11 +163,11 @@ def write_model( use_swiglu_ffn=False, apply_layernorm=True, reshape_hidden_states=True, - intermediate_hook_ids = [11, 5], - intermediate_feature_dims = [256, 256], - scaled_images_ratios = [0.25, 0.5, 1], - scaled_images_overlap_ratios = [0.0, 0.5, 0.25], - scaled_images_feature_dims = [1024, 1024, 512], + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], use_batch_norm_in_fusion=False, use_fov_model=True, 
num_fov_head_layers=2, @@ -215,18 +216,19 @@ def write_model( DepthProForDepthEstimation.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto") print("Model reloaded successfully.") + def write_image_processor(output_dir: str): image_processor = DepthProImageProcessorFast( - do_resize = True, - size = {"height": 1536, "width": 1536}, - resample = PILImageResampling.BILINEAR, - antialias = False, - do_rescale = True, - rescale_factor = 1 / 255, - do_normalize = True, - image_mean = 0.5, - image_std = 0.5, - return_tensors = "pt", + do_resize=True, + size={"height": 1536, "width": 1536}, + resample=PILImageResampling.BILINEAR, + antialias=False, + do_rescale=True, + rescale_factor=1 / 255, + do_normalize=True, + image_mean=0.5, + image_std=0.5, + return_tensors="pt", ) image_processor.save_pretrained(output_dir) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 6c9c7f94e226..15a33f804d14 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -371,18 +371,13 @@ def post_process_depth_estimation( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - outputs = { - "predicted_depth": [], - "fov": [] if fovs is not None else None - } + outputs = {"predicted_depth": [], "fov": [] if fovs is not None else None} fovs = [None] * len(predicted_depths) if fovs is None else fovs target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - if target_size is not None: - # scale image w.r.t fov if fov is not None: width = target_size[1] @@ -395,7 +390,7 @@ def post_process_depth_estimation( predicted_depth.unsqueeze(0).unsqueeze(1), size=target_size, resample=self.resample, - antialias=self.antialias + antialias=self.antialias, ).squeeze() # inverse the depth diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 46b502d7d26f..374d5c25cafc 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -161,7 +161,7 @@ def _build_transforms( Resize( (size["height"], size["width"]), interpolation=pil_torch_interpolation_mapping[resample], - antialias=antialias + antialias=antialias, ) ) @@ -351,18 +351,13 @@ def post_process_depth_estimation( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - outputs = { - "predicted_depth": [], - "fov": [] if fovs is not None else None - } + outputs = {"predicted_depth": [], "fov": [] if fovs is not None else None} fovs = [None] * len(predicted_depths) if fovs is None else fovs target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - if target_size is not None: - # scale image w.r.t fov if fov is not None: width = target_size[1] diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 5b521cfda9bd..77983933a19a 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -40,17 +40,11 @@ # General 
docstring _CONFIG_FOR_DOC = "DepthProConfig" -# Base docstring -_CHECKPOINT_FOR_DOC = "geetu040/DepthPro" -_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024] - -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT class DepthProViTPatchEmbeddings(nn.Module): """ - This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial - `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a - Transformer. + Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings + with addition of config parameter patch_embeddings_size """ def __init__(self, config): @@ -60,6 +54,7 @@ def __init__(self, config): self.in_channels = config.num_channels self.out_channels = config.hidden_size self.patch_embeddings_size = config.patch_embeddings_size + self.num_channels = config.num_channels self.projection = nn.Conv2d( self.in_channels, @@ -68,9 +63,10 @@ def __init__(self, config): stride=(self.patch_embeddings_size, self.patch_embeddings_size), ) + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings.forward def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.config.num_channels: + if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." @@ -79,11 +75,10 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.dinov2.modeling_dinov2.DepthProViTEmbeddings -# with DepthProViT->DepthProViT and antialias=True in interpolation class DepthProViTEmbeddings(nn.Module): """ - Construct the CLS token, position and patch embeddings. 
+ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Embeddings + except antialias=True in interpolation and removal of mask_token """ def __init__(self, config: DepthProConfig) -> None: @@ -131,7 +126,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: size=(new_height, new_width), mode="bicubic", align_corners=False, - antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProPatchEmbeddings + antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProViTPatchEmbeddings ).to(dtype=target_dtype) patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) @@ -155,7 +150,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthPro class DepthProViTSelfAttention(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -216,7 +211,7 @@ def forward( return outputs -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SelfAttention with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): def __init__(self, config: DepthProConfig) -> None: super().__init__(config) @@ -226,8 +221,9 @@ def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + "DepthProViTModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( @@ -257,7 +253,7 @@ def forward( return context_layer, None -# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTSelfOutput(nn.Module): """ The residual connection is defined in DepthProViTLayer instead of here (as is the case with other models), due to the @@ -276,7 +272,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTAttention(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -316,14 +312,14 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTSdpaAttention(DepthProViTAttention): def __init__(self, config: DepthProConfig) -> None: super().__init__(config) self.attention = DepthProViTSdpaSelfAttention(config) -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaAttention with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2LayerScale with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTLayerScale(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -369,7 +365,7 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthPro class DepthProViTMLP(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -389,7 +385,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthPro class DepthProViTSwiGLUFFN(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -413,7 +409,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: } -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing class DepthProViTLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" @@ -465,7 +461,7 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -569,14 +565,14 @@ def forward( class DepthProUpsampleBlock(nn.Module): def __init__( - self, - input_dims, - intermediate_dims, - output_dims, - n_upsample_layers, - use_proj=True, - bias=False, - ) -> None: + self, + input_dims, + intermediate_dims, + output_dims, + n_upsample_layers, + use_proj=True, + bias=False, + ) -> None: super().__init__() # create first 
projection block @@ -620,6 +616,7 @@ def interpolate(pixel_values, scale_factor): align_corners=False, ) + def patch(pixel_values, patch_size, overlap_ratio): """Creates Patches from Batch.""" B, C, W, H = pixel_values.shape @@ -631,9 +628,7 @@ def patch(pixel_values, patch_size, overlap_ratio): stride = int(patch_size * (1 - overlap_ratio)) # (B, C, W, H) - patches = torch.nn.functional.unfold( - pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) - ) + patches = torch.nn.functional.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride)) # patches.shape (B, patch_size**2 * C, num_patches) patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, patch_size**2 * C) @@ -642,11 +637,12 @@ def patch(pixel_values, patch_size, overlap_ratio): return patches + def reshape_feature(hidden_states, width, height): """Discard class token and reshape 1D feature map to a 2D grid.""" B, _, C = hidden_states.shape # (B, WH+1, C) - hidden_states = hidden_states[:, 1:, :] # remove class token + hidden_states = hidden_states[:, 1:, :] # remove class token # (B, WH, C) hidden_states = hidden_states.reshape(B, width, height, C) # (B, W, H, C) @@ -654,6 +650,7 @@ def reshape_feature(hidden_states, width, height): # (B, C, W, H) return hidden_states + def merge(patches, batch_size, merge_out_size): """Recreates Batch from Patches.""" num_patches, num_channels, out_size, out_size = patches.shape @@ -668,7 +665,7 @@ def merge(patches, batch_size, merge_out_size): merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) """ - padding = ( box_size * out_size - merge_out_size ) // ( 2 * box_size - 2 ) + padding = (box_size * out_size - merge_out_size) // (2 * box_size - 2) i = 0 boxes = [] @@ -685,10 +682,10 @@ def merge(patches, batch_size, merge_out_size): box = box[..., :, padding:] if h != box_size - 1: # remove pad from height if box is not at bottom border - box = box[..., :box.shape[-2]-padding, :] + box = box[..., : box.shape[-2] - padding, :] if w != box_size - 1: # remove pad from width if box is not at right border - box = box[..., :, :box.shape[-1]-padding] + box = box[..., :, : box.shape[-1] - padding] boxes_in_row.append(box) i += 1 @@ -717,13 +714,12 @@ def __init__(self, config: DepthProConfig) -> None: self.n_scaled_images = len(self.scaled_images_ratios) self.n_intermediate_hooks = len(self.intermediate_hook_ids) self.out_size = config.patch_size // config.patch_embeddings_size - self.seq_len = self.out_size ** 2 # each patch is flattened + self.seq_len = self.out_size**2 # each patch is flattened # config.scaled_images_ratios is sorted if config.scaled_images_ratios != sorted(config.scaled_images_ratios): raise ValueError( - f"Values in scaled_images_ratios={config.scaled_images_ratios} " - "should be sorted from low to high" + f"Values in scaled_images_ratios={config.scaled_images_ratios} " "should be sorted from low to high" ) # lowest image resolution is greator than the patch_size @@ -767,7 +763,7 @@ def __init__(self, config: DepthProConfig) -> None: input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=2+i, + n_upsample_layers=2 + i, ) self.upsample_intermediate.append(upsample_block) @@ -783,7 +779,7 @@ def __init__(self, config: DepthProConfig) -> None: # for STEP 7: fuse low_res and image features self.fuse_image_with_low_res = nn.Conv2d( - 
in_channels=config.scaled_images_feature_dims[0]*2, + in_channels=config.scaled_images_feature_dims[0] * 2, out_channels=config.scaled_images_feature_dims[0], kernel_size=1, stride=1, @@ -838,7 +834,7 @@ def forward( overlap_ratio=self.scaled_images_overlap_ratios[i], ) scaled_images_num_patches = [len(i) for i in scaled_images] - patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first # (sum(scaled_images_num_patches), config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -847,16 +843,15 @@ def forward( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=True, # required for intermediate features + output_hidden_states=True, # required for intermediate features return_dict=True, ) scaled_images_last_hidden_state = torch.split_with_sizes( - patch_encodings.last_hidden_state, - scaled_images_num_patches[::-1] - )[::-1] # -1 as patch encoder expects high res patches first + patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] + )[::-1] # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -874,12 +869,12 @@ def forward( # b. reshape back to image like features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) + ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) # c. merge patches back together features = merge( - features, batch_size=B, merge_out_size=self.out_size*2**i - ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) + features, batch_size=B, merge_out_size=self.out_size * 2**i + ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) # d. upsample features = self.upsample_scaled_images[i](features) @@ -891,11 +886,14 @@ def forward( intermediate_features = [] for i in range(self.n_intermediate_hooks): - # a. extract hidden_state - layer_id = self.intermediate_hook_ids[i] + 1 # +1 to correct index position as hidden_states contain embedding output as well + layer_id = ( + self.intermediate_hook_ids[i] + 1 + ) # +1 to correct index position as hidden_states contain embedding output as well hidden_state = patch_encodings.hidden_states[layer_id] - hidden_state = hidden_state[:scaled_images_num_patches[-1]] # num_patches to be of same length as highest resolution + hidden_state = hidden_state[ + : scaled_images_num_patches[-1] + ] # num_patches to be of same length as highest resolution # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like @@ -903,12 +901,14 @@ def forward( hidden_state, self.out_size, self.out_size, - ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together features = merge( - features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (self.n_scaled_images - 1), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample features = self.upsample_intermediate[i](features) @@ -919,20 +919,26 @@ def forward( # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + hidden_state = ( + image_encodings.last_hidden_state + ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together image_features = merge( - image_features, batch_size=B, merge_out_size=self.out_size*2**(0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + image_features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - image_features = self.upsample_image(image_features) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) + image_features = self.upsample_image( + image_features + ) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) # STEP 7: apply fusion (global_features = image_features + scaled_images_features[0]) # fuses image_features with lowest resolution features as they are of same size @@ -1089,37 +1095,49 @@ def forward( return encodings -# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro -class DepthProResidualLayer(nn.Module): +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro +class DepthProPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthProConfig]`): + Model configuration class defining the model architecture. 
+ """ + def __init__(self, config): super().__init__() - self.use_batch_norm = config.use_batch_norm_in_fusion - self.hidden_size = config.fusion_hidden_size + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) self.activation1 = nn.ReLU() self.convolution1 = nn.Conv2d( - self.hidden_size, - self.hidden_size, + config.fusion_hidden_size, + config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=(not self.use_batch_norm), + bias=use_bias_in_fusion_residual, ) self.activation2 = nn.ReLU() self.convolution2 = nn.Conv2d( - self.hidden_size, - self.hidden_size, + config.fusion_hidden_size, + config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=(not self.use_batch_norm), + bias=use_bias_in_fusion_residual, ) if self.use_batch_norm: - self.batch_norm1 = nn.BatchNorm2d(self.hidden_size) - self.batch_norm2 = nn.BatchNorm2d(self.hidden_size) + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size) def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: residual = hidden_state @@ -1139,15 +1157,16 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state + residual -# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# Taken from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# except it uses deconv, skip_add and avoids interpolation (it always receives consitent inputs) class DepthProFeatureFusionLayer(nn.Module): - def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: + def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: super().__init__() self.config = config self.use_deconv = use_deconv - self.residual_layer1 = DepthProResidualLayer(config) - self.residual_layer2 = DepthProResidualLayer(config) + self.residual_layer1 = DepthProPreActResidualLayer(config) + self.residual_layer2 = DepthProPreActResidualLayer(config) if self.use_deconv: self.deconv = nn.ConvTranspose2d( @@ -1174,13 +1193,14 @@ def forward(self, hidden_state, residual=None): return hidden_state -# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro with extra layer parameters +# Take from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro +# with extra layer parameters, deconv and reversed layers class DepthProFeatureFusionStage(nn.Module): def __init__(self, config, num_layers): super().__init__() self.num_layers = num_layers self.layers = nn.ModuleList() - for _ in range(self.num_layers-1): + for _ in range(self.num_layers - 1): self.layers.append(DepthProFeatureFusionLayer(config)) # final layer doesnot require deconvolution self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) @@ -1214,7 +1234,7 @@ def __init__(self, config: DepthProConfig) -> None: self.encoder_neck = nn.Linear(self.hidden_size, self.fusion_hidden_size // 2) self.global_neck = nn.Sequential( nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True) + nn.ReLU(True), ) if config.fusion_hidden_size // 2**config.num_fov_head_layers == 0: @@ -1227,19 +1247,21 @@ def __init__(self, config: DepthProConfig) -> None: self.head = nn.Sequential() for i in range(config.num_fov_head_layers): self.head.append( - 
nn.Conv2d(self.fusion_hidden_size // 2**(i+1), self.fusion_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + nn.Conv2d( + self.fusion_hidden_size // 2 ** (i + 1), + self.fusion_hidden_size // 2 ** (i + 2), + kernel_size=3, + stride=2, + padding=1, + ) ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.fusion_hidden_size // 2**(config.num_fov_head_layers+1) + final_in_channels = self.fusion_hidden_size // 2 ** (config.num_fov_head_layers + 1) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( - in_channels=final_in_channels, - out_channels=1, - kernel_size=final_kernal_size, - stride=1, - padding=0 + in_channels=final_in_channels, out_channels=1, kernel_size=final_kernal_size, stride=1, padding=0 ) ) @@ -1263,7 +1285,7 @@ def forward( # follow the steps same as with image features in DepthProEncoder pixel_values = interpolate( pixel_values, - scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image + scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image ) patches = patch( pixel_values, @@ -1279,11 +1301,7 @@ def forward( ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.encoder_neck(last_hidden_state) - last_hidden_state = reshape_feature( - last_hidden_state, - width=self.out_size, - height=self.out_size - ) + last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) last_hidden_state = merge( last_hidden_state, batch_size=B, @@ -1321,12 +1339,11 @@ def __init__(self, config): features = config.fusion_hidden_size self.head = nn.Sequential( - nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( - in_channels=features//2, out_channels=features//2, - kernel_size=2, stride=2, padding=0, bias=True + in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True ), - nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(True), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(), @@ -1347,6 +1364,7 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. 
""" + fov: Optional[torch.FloatTensor] = None @@ -1369,7 +1387,7 @@ def __init__(self, config, use_fov_model=None): combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims self.projections = nn.ModuleList() for i, in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims)-1 and in_channels == config.fusion_hidden_size: + if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: # projection for last layer can be ignored if input and output channels already match self.projections.append(nn.Identity()) else: @@ -1397,7 +1415,6 @@ def __init__(self, config, use_fov_model=None): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1454,7 +1471,9 @@ def forward( ) fov = fov_encodings.last_hidden_state attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + hidden_states = ( + depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + ) else: fov = None attentions = depth_pro_outputs.attentions From 3c656f24a5e33fed84663f2c0d45053b2b3c4e91 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 28 Nov 2024 01:29:54 +0500 Subject: [PATCH 035/151] add examples; update weight convert script --- .../convert_depth_pro_weights_to_hf.py | 4 +- .../models/depth_pro/modeling_depth_pro.py | 58 ++++++++++++++++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 377595b746ac..cd06a99c5fb2 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -168,7 +168,8 @@ def write_model( scaled_images_ratios=[0.25, 0.5, 1], scaled_images_overlap_ratios=[0.0, 0.5, 0.25], scaled_images_feature_dims=[1024, 1024, 512], - use_batch_norm_in_fusion=False, + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, use_fov_model=True, num_fov_head_layers=2, ) @@ -228,7 +229,6 @@ def write_image_processor(output_dir: str): do_normalize=True, image_mean=0.5, image_std=0.5, - return_tensors="pt", ) image_processor.save_pretrained(output_dir) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 77983933a19a..255174de0993 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1068,8 +1068,34 @@ def forward( Returns: Examples: - TODO + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, DepthProModel + >>> + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> + >>> checkpoint = "geetu040/DepthPro" + >>> processor = AutoProcessor.from_pretrained(checkpoint) + >>> model = DepthProModel.from_pretrained(checkpoint) + >>> + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + >>> + >>> with torch.no_grad(): + ... 
output = model(**inputs) + ... + >>> for state in output.last_hidden_state: + ... print(state.shape) + ... + torch.Size([1, 1024, 48, 48]) + torch.Size([1, 1024, 96, 96]) + torch.Size([1, 512, 192, 192]) + torch.Size([1, 256, 384, 384]) + torch.Size([1, 256, 768, 768]) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1433,8 +1459,36 @@ def forward( Returns: Examples: - TODO + ```python + >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation + >>> import torch + >>> from PIL import Image + >>> import requests + >>> + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> + >>> checkpoint = "geetu040/DepthPro" + >>> processor = AutoImageProcessor.from_pretrained(checkpoint) + >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) + >>> + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + >>> + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... + >>> # interpolate to original size + >>> post_processed_output = processor.post_process_depth_estimation( + ... outputs.predicted_depth, outputs.fov, target_sizes=[(image.height, image.width)], + ... ) + >>> + >>> # visualize the prediction + >>> predicted_depth = post_processed_output["predicted_depth"][0] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) ```""" loss = None if labels is not None: From f6f6d3d130b97519b8f9bf0ae9413301f655ecd9 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 10:08:56 +0500 Subject: [PATCH 036/151] fix using check_table.py and isort --- docs/source/en/index.md | 1 + src/transformers/__init__.py | 18 ++++++++-------- .../models/auto/configuration_auto.py | 4 ++-- .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 6 +++--- .../models/gemma/configuration_gemma.py | 1 - src/transformers/utils/dummy_pt_objects.py | 21 +++++++++++++++++++ .../utils/dummy_vision_objects.py | 14 +++++++++++++ 8 files changed, 51 insertions(+), 16 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index aaff45ab65df..d316e89ce6f4 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -117,6 +117,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [DeiT](model_doc/deit) | ✅ | ✅ | ❌ | | [DePlot](model_doc/deplot) | ✅ | ❌ | ❌ | | [Depth Anything](model_doc/depth_anything) | ✅ | ❌ | ❌ | +| [DepthPro](model_doc/depth_pro) | ✅ | ❌ | ❌ | | [DETA](model_doc/deta) | ✅ | ❌ | ❌ | | [DETR](model_doc/detr) | ✅ | ❌ | ❌ | | [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ | diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0e6c48762a85..d4ac4b5fd866 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -387,6 +387,7 @@ "models.deprecated.vit_hybrid": ["ViTHybridConfig"], "models.deprecated.xlm_prophetnet": ["XLMProphetNetConfig"], "models.depth_anything": ["DepthAnythingConfig"], + "models.depth_pro": ["DepthProConfig"], "models.detr": ["DetrConfig"], "models.dialogpt": [], "models.dinat": ["DinatConfig"], @@ -408,7 +409,6 @@ "DPRReaderTokenizer", ], "models.dpt": ["DPTConfig"], - "models.depth_pro": ["DepthProConfig"], "models.efficientnet": ["EfficientNetConfig"], "models.electra": [ "ElectraConfig", @@ -1193,10 +1193,10 @@ _import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.deprecated.tvlt"].append("TvltImageProcessor") _import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"]) + _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) - _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) @@ -2078,6 +2078,13 @@ "DepthAnythingPreTrainedModel", ] ) + _import_structure["models.depth_pro"].extend( + [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + ) _import_structure["models.detr"].extend( [ "DetrForObjectDetection", @@ -2138,13 +2145,6 @@ "DPTPreTrainedModel", ] ) - _import_structure["models.depth_pro"].extend( - [ - "DepthProForDepthEstimation", - "DepthProModel", - "DepthProPreTrainedModel", - ] - ) _import_structure["models.efficientnet"].extend( [ "EfficientNetForImageClassification", diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d8860d38f850..a02af514b65a 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -85,6 +85,7 @@ ("deformable_detr", "DeformableDetrConfig"), ("deit", "DeiTConfig"), ("depth_anything", "DepthAnythingConfig"), + ("depth_pro", "DepthProConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), ("dinat", "DinatConfig"), @@ -93,7 +94,6 @@ ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), - ("depth_pro", "DepthProConfig"), ("efficientformer", "EfficientFormerConfig"), ("efficientnet", "EfficientNetConfig"), ("electra", "ElectraConfig"), @@ -385,6 +385,7 @@ ("deplot", "DePlot"), ("depth_anything", "Depth Anything"), ("depth_anything_v2", "Depth Anything V2"), + ("depth_pro", "DepthPro"), ("deta", "DETA"), ("detr", "DETR"), 
("dialogpt", "DialoGPT"), @@ -395,7 +396,6 @@ ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), - ("depth_pro", "DepthPro"), ("efficientformer", "EfficientFormer"), ("efficientnet", "EfficientNet"), ("electra", "ELECTRA"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index e7b53f30a7a0..3887f29415b0 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -71,13 +71,13 @@ ("deformable_detr", ("DeformableDetrImageProcessor",)), ("deit", ("DeiTImageProcessor",)), ("depth_anything", ("DPTImageProcessor",)), + ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("deta", ("DetaImageProcessor",)), ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")), ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("dinov2", ("BitImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)), ("dpt", ("DPTImageProcessor",)), - ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor",)), ("efficientnet", ("EfficientNetImageProcessor",)), ("flava", ("FlavaImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4cc15ca4ca51..b8bcd0cbcb00 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -84,6 +84,7 @@ ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), ("dinat", "DinatModel"), @@ -92,7 +93,6 @@ ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), - ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("electra", "ElectraModel"), @@ -567,12 +567,12 @@ ("data2vec-vision", "Data2VecVisionModel"), ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dpt", "DPTModel"), - ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), @@ -867,8 +867,8 @@ [ # Model for depth estimation mapping ("depth_anything", "DepthAnythingForDepthEstimation"), - ("dpt", "DPTForDepthEstimation"), ("depth_pro", "DepthProForDepthEstimation"), + ("dpt", "DPTForDepthEstimation"), ("glpn", "GLPNForDepthEstimation"), ("zoedepth", "ZoeDepthForDepthEstimation"), ] diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index e170803cccab..346f386ba698 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,7 +20,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 36e1ff2cfe65..dc32f6d653d6 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3457,6 +3457,27 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class DepthProForDepthEstimation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DepthProModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DepthProPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class DetrForObjectDetection(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 19cf02a4e858..1ceb9e227bb2 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -177,6 +177,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class DepthProImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + +class DepthProImageProcessorFast(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class DetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From b4575d026de8a8ca69650c76ab3b21f22e860a48 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 10:45:19 +0500 Subject: [PATCH 037/151] fix config docstring --- .../models/depth_pro/configuration_depth_pro.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d938f0a721f1..d48d68b832b4 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -38,7 +38,7 @@ class DepthProConfig(PretrainedConfig): The number of channels before fusion. num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): + num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. mlp_ratio (`int`, *optional*, defaults to 4): Ratio of the hidden size of the MLPs relative to the `hidden_size`. @@ -58,7 +58,7 @@ class DepthProConfig(PretrainedConfig): To generate depth of same size as image, image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - patch_size (`int`, *optional*, defaults to 14): + patch_size (`int`, *optional*, defaults to 384): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. @@ -90,9 +90,11 @@ class DepthProConfig(PretrainedConfig): Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. 
use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): + Whether to use bias in the pre-activate residual units of the fusion blocks. use_fov_model (`bool`, *optional*, defaults to `True`): Whether to use `DepthProFOVModel` to generate the field of view. - num_fov_head_layers (`int`, *optional*, defaults to `2`): + num_fov_head_layers (`int`, *optional*, defaults to 2): Number of convolution layers in the head of `DepthProFOVModel`. Example: From c8d8a9e0ca3750cc062fe9ad3b90fdbe5a893f0b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 11:26:12 +0500 Subject: [PATCH 038/151] add depth pro to sdpa docs --- docs/source/en/perf_infer_gpu_one.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 67bd31fdaeed..4f1ccc9c427c 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -227,6 +227,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) +* [DepthPro](https://huggingface.co/docs/transformers/model_doc/depth_pro#transformers.DepthProModel) * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) From 77873de8a34447d64d16e1a5def4ba8fb7109bb5 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Fri, 29 Nov 2024 15:30:42 +0500 Subject: [PATCH 039/151] undo unintentional changes in configuration_gemma.py --- src/transformers/models/gemma/configuration_gemma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 346f386ba698..e170803cccab 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,6 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ from ...configuration_utils import PretrainedConfig From 5f2378d112193317902a733d13b21fc081fc8b56 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 30 Nov 2024 23:51:55 +0500 Subject: [PATCH 040/151] minor fixes --- src/transformers/models/__init__.py | 1 + .../depth_pro/image_processing_depth_pro.py | 24 +++++++++++-------- .../models/depth_pro/modeling_depth_pro.py | 7 +----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 9155f629e63f..fc26362dd64d 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -69,6 +69,7 @@ deit, deprecated, depth_anything, + depth_pro, detr, dialogpt, dinat, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 15a33f804d14..746f246fcd73 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -113,7 +113,7 @@ def __init__( def resize( self, - images: List[np.ndarray], + image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, antialias: bool = False, @@ -125,8 +125,8 @@ def resize( Resize an image to `(size["height"], size["width"])`. Args: - images (`List[np.ndarray]`): - Images to resize. + image (`np.ndarray`): + Image to resize. size (`Dict[str, int]`): Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): @@ -157,16 +157,13 @@ def resize( raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - images = np.stack(images) - images = torch.from_numpy(images) - return torch.nn.functional.interpolate( # input should be (B, C, H, W) - input=images, + input=torch.from_numpy(image).unsqueeze(0), size=output_size, mode=pil_torch_interpolation_mapping[resample].value, antialias=antialias, - ) + ).squeeze(0).numpy() def _validate_input_arguments( self, @@ -321,8 +318,15 @@ def preprocess( # depth-pro scales the image before resizing it # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: - images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) - images = images.numpy() + images = [ + self.resize( + image=image, + size=size, + resample=resample, + antialias=antialias, + ) + for image in images + ] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 255174de0993..16601f9c7c86 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -660,7 +660,7 @@ def merge(patches, batch_size, merge_out_size): # patches are not created when scaled image size is equal to patch size return patches - box_size = int(math.sqrt(num_patches // batch_size)) + box_size = math.ceil(math.sqrt(num_patches // batch_size)) """ merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) @@ -806,11 +806,6 @@ def forward( B, C, H, W = pixel_values.shape - if not (H == W == self.config.image_size): - raise ValueError( - f"Height={H} and Width={W} doesnot match the specified image_size={self.config.image_size} in config." - ) - if not (C == self.config.num_channels): raise ValueError( f"Found {C} channels in image, expected number of channels is {self.config.num_channels} from config." From d51d0b198824370c47650ca6cc49f403e9c752cc Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 30 Nov 2024 23:57:26 +0500 Subject: [PATCH 041/151] test image processing --- .../test_image_processing_depth_pro.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 tests/models/depth_pro/test_image_processing_depth_pro.py diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py new file mode 100644 index 000000000000..eea9ed01378d --- /dev/null +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
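# The resize hunk above moves the processor from batched to per-image interpolation. A minimal
# standalone sketch of that contract (channels-first NumPy in, channels-first NumPy out) follows;
# the helper name and the 384x384 target size are illustrative, not part of this patch.
import numpy as np
import torch

def resize_channels_first(image: np.ndarray, height: int, width: int) -> np.ndarray:
    tensor = torch.from_numpy(image).unsqueeze(0)  # (C, H, W) -> (1, C, H, W), as interpolate expects
    resized = torch.nn.functional.interpolate(
        tensor, size=(height, width), mode="bilinear", antialias=False
    )
    return resized.squeeze(0).numpy()  # back to (C, height, width)

print(resize_channels_first(np.random.rand(3, 100, 120).astype(np.float32), 384, 384).shape)  # (3, 384, 384)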
+ + +import unittest + +from transformers.file_utils import is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import DepthProImageProcessor, DepthProImageProcessorFast + + +class DepthProImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + super().__init__() + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DepthProImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DepthProImageProcessor if is_vision_available() else None + fast_image_processing_class = DepthProImageProcessorFast if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DepthProImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "resample")) + self.assertTrue(hasattr(image_processing, "antialias")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) From 082b05555df1b7b55335d6790582f47b0e6c4ca1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 02:01:42 +0500 Subject: [PATCH 042/151] fixes and tests --- docs/source/en/model_doc/depth_pro.md | 119 +++++++ 
.../depth_pro/configuration_depth_pro.py | 2 +- .../depth_pro/image_processing_depth_pro.py | 1 - .../models/depth_pro/modeling_depth_pro.py | 177 +++++---- tests/models/depth_pro/__init__.py | 0 .../depth_pro/test_modeling_depth_pro.py | 335 ++++++++++++++++++ 6 files changed, 558 insertions(+), 76 deletions(-) create mode 100644 docs/source/en/model_doc/depth_pro.md create mode 100644 tests/models/depth_pro/__init__.py create mode 100644 tests/models/depth_pro/test_modeling_depth_pro.py diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md new file mode 100644 index 000000000000..6472cc506dae --- /dev/null +++ b/docs/source/en/model_doc/depth_pro.md @@ -0,0 +1,119 @@ + + +# DepthPro + +## Overview + +The DepthPro model was proposed in [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073) by Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, Vladlen Koltun. + +It leverages a multi-scale [Vision Transformer (ViT)](vit) optimized for dense predictions. It downsamples an image at several scales. At each scale, it is split into patches, which are processed by a ViT-based [Dinov2](dinov2) patch encoder, with weights shared across scales. Patches are merged into feature maps, upsampled, and fused via a [DPT](dpt) like decoder. + +The abstract from the paper is the following: + +*We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* + + + + DepthPro architecture. Taken from the original paper. + +This model was contributed by [geetu040](https://github.com/geetu040). The original code can be found [here](https://github.com/apple/ml-depth-pro). + + + +## Usage tips + +```python +from transformers import Dinov2Config, DepthProConfig, DepthProForDepthEstimation + +# initialize with a Transformer-based backbone such as DINOv2 +# in that case, we also specify `reshape_hidden_states=False` to get feature maps of shape (batch_size, num_channels, height, width) +backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False) + +config = DepthProConfig(backbone_config=backbone_config) +model = DepthProForDepthEstimation(config=config) +``` + +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. 
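A hedged example of requesting SDPA explicitly for DepthPro is sketched below; the checkpoint id is a placeholder rather than a published Hub name.

```python
import torch
from transformers import DepthProForDepthEstimation

# "<depth-pro-checkpoint>" is a placeholder; substitute the checkpoint you actually use
model = DepthProForDepthEstimation.from_pretrained(
    "<depth-pro-checkpoint>", attn_implementation="sdpa", torch_dtype=torch.float16
)
```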
See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import ViTForImageClassification +model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-base-patch16-224` model, we saw the following speedups during inference. + +| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) | +|--------------|-------------------------------------------|-------------------------------------------|------------------------------| +| 1 | 7 | 6 | 1.17 | +| 2 | 8 | 6 | 1.33 | +| 4 | 8 | 6 | 1.33 | +| 8 | 8 | 6 | 1.33 | + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro. + +- Demo notebooks for [`DepthProForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DepthPro). + +- [Semantic segmentation task guide](../tasks/semantic_segmentation) +- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## DepthProConfig + +[[autodoc]] DepthProConfig + +## DepthProFeatureExtractor + +[[autodoc]] DepthProFeatureExtractor + - __call__ + - post_process_semantic_segmentation + +## DepthProImageProcessor + +[[autodoc]] DepthProImageProcessor + - preprocess + - post_process_semantic_segmentation + +## DepthProModel + +[[autodoc]] DepthProModel + - forward + +## DepthProForDepthEstimation + +[[autodoc]] DepthProForDepthEstimation + - forward + +## DepthProForSemanticSegmentation + +[[autodoc]] DepthProForSemanticSegmentation + - forward diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d48d68b832b4..beb3215d8ddf 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -56,7 +56,7 @@ class DepthProConfig(PretrainedConfig): image_size (`int`, *optional*, defaults to 1536): The size (resolution) of each image, To generate depth of same size as image, - image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size + image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) patch_size (`int`, *optional*, defaults to 384): The size (resolution) of each patch. 
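# A quick numeric check of the corrected relation above, using the defaults visible in this
# configuration (image_size=1536, patch_size=384, intermediate_hook_ids=[11, 5],
# scaled_images_ratios=[0.25, 0.5, 1]); patch_embeddings_size=16 is an assumed default,
# as it is not shown in this hunk.
image_size = 1536
patch_size = 384
patch_embeddings_size = 16  # assumption, not taken from this hunk
n_fusion_blocks = len([11, 5]) + len([0.25, 0.5, 1])  # intermediate hooks + scaled image ratios = 5
assert image_size / 2 ** (n_fusion_blocks + 1) == patch_size / patch_embeddings_size  # 24.0 == 24.0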
diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 746f246fcd73..65a29900c637 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -264,7 +264,6 @@ def preprocess( image_std = image_std if image_std is not None else self.image_std size = size if size is not None else self.size - size_dict = get_size_dict(size) images = make_list_of_images(images) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 16601f9c7c86..2e074588d4e3 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -31,6 +31,7 @@ logging, replace_return_docstrings, torch_int, + ModelOutput, ) from .configuration_depth_pro import DepthProConfig @@ -87,9 +88,9 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.seq_len = (config.patch_size // config.patch_embeddings_size) ** 2 - self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) - self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.zeros(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: @@ -698,6 +699,35 @@ def merge(patches, batch_size, merge_out_size): return boxes +@dataclass +class DepthProOutput(ModelOutput): + """ + Base class for DepthPro's outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + features (`List[torch.FloatTensor]`, *optional*: + Features from scaled images and hidden_states. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + last_hidden_state: torch.FloatTensor = None + features: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -794,7 +824,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: + ) -> Union[tuple, DepthProOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -848,8 +878,8 @@ def forward( image_encodings = self.image_encoder( pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, return_dict=True, ) @@ -941,21 +971,36 @@ def forward( scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) # STEP 8: return these features in order of increasing size as what fusion expects - last_hidden_state = [ + features = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) *scaled_images_features, # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) *intermediate_features, ] - hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None - attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None + # prepare last_hidden_state, hidden_states, attentions from patches to batches + + last_hidden_state = patch_encodings.last_hidden_state + hidden_states = patch_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions if output_attentions else None + + num_patches = sum(scaled_images_num_patches) + # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 + indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T + indexes = indexes.to(last_hidden_state.device) + + last_hidden_state = last_hidden_state[indexes].mean(1) + if hidden_states is not None: + hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) + if attentions is not None: + attentions = tuple([state[indexes].mean(1) for state in attentions]) if not return_dict: - return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) + return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) - return BaseModelOutput( + return DepthProOutput( last_hidden_state=last_hidden_state, + features=features, hidden_states=hidden_states, attentions=attentions, ) @@ -1034,11 +1079,7 @@ def __init__(self, config): self.post_init() def get_input_embeddings(self): - embeddings = { - "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, - "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, - } - return embeddings + return self.encoder.patch_encoder.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune): """ @@ -1058,7 +1099,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> 
Union[Tuple, BaseModelOutput]: + ) -> Union[Tuple, DepthProOutput]: r""" Returns: @@ -1215,7 +1256,7 @@ def forward(self, hidden_state, residual=None): # Take from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro -# with extra layer parameters, deconv and reversed layers +# with num_layers, deconv and reversed layers class DepthProFeatureFusionStage(nn.Module): def __init__(self, config, num_layers): super().__init__() @@ -1269,8 +1310,8 @@ def __init__(self, config: DepthProConfig) -> None: for i in range(config.num_fov_head_layers): self.head.append( nn.Conv2d( - self.fusion_hidden_size // 2 ** (i + 1), - self.fusion_hidden_size // 2 ** (i + 2), + math.ceil(self.fusion_hidden_size / 2 ** (i + 1)), + math.ceil(self.fusion_hidden_size / 2 ** (i + 2)), kernel_size=3, stride=2, padding=1, @@ -1278,7 +1319,7 @@ def __init__(self, config: DepthProConfig) -> None: ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.fusion_hidden_size // 2 ** (config.num_fov_head_layers + 1) + final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( @@ -1291,16 +1332,7 @@ def forward( pixel_values: torch.Tensor, global_features: torch.Tensor, head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + ) -> torch.Tensor: B, C, W, H = pixel_values.shape # follow the steps same as with image features in DepthProEncoder @@ -1316,11 +1348,11 @@ def forward( encoder_outputs = self.encoder( patches, head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, + return_dict=True, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.encoder_neck(last_hidden_state) last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) last_hidden_state = merge( @@ -1335,15 +1367,7 @@ def forward( fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(B) - if not return_dict: - head_outputs = (fov_output,) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutput( - last_hidden_state=fov_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + return fov_output class DepthProDepthEstimationHead(nn.Module): @@ -1377,16 +1401,36 @@ def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: @dataclass -class DepthProDepthEstimatorOutput(DepthEstimatorOutput): +class DepthProDepthEstimatorOutput(ModelOutput): """ - Base class for outputs of DepthProDepthEstimator. + Base class for DepthProForDepthEstimation's output. 
Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + fov (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. """ + loss: Optional[torch.FloatTensor] = None + predicted_depth: torch.FloatTensor = None fov: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @add_start_docstrings( @@ -1502,41 +1546,26 @@ def forward( output_hidden_states=output_hidden_states, return_dict=True, ) - last_hidden_state = depth_pro_outputs.last_hidden_state - last_hidden_state = [proj(state) for proj, state in zip(self.projections, last_hidden_state)] - fused_state = self.fusion_stage(last_hidden_state) - predicted_depth = self.head(fused_state) + features = depth_pro_outputs.features + features = [proj(feature) for proj, feature in zip(self.projections, features)] + fused_features = self.fusion_stage(features) + predicted_depth = self.head(fused_features) - if self.use_fov_model: + fov = self.fov_model( + pixel_values=pixel_values, # use lowest scaled image features for fov model - global_features = last_hidden_state[0].detach() - fov_encodings = self.fov_model( - pixel_values=pixel_values, - global_features=global_features, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - fov = fov_encodings.last_hidden_state - attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = ( - depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None - ) - else: - fov = None - attentions = depth_pro_outputs.attentions - hidden_states = depth_pro_outputs.hidden_states + global_features=features[0].detach(), + head_mask=head_mask, + ) if self.use_fov_model else None if not return_dict: - outputs = (predicted_depth, fov, hidden_states, attentions) - outputs = (i for i in outputs if i is not None) - return outputs + outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] + return tuple(v for v in outputs if v is not None) return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, fov=fov, - hidden_states=hidden_states, - attentions=attentions, + 
hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, ) diff --git a/tests/models/depth_pro/__init__.py b/tests/models/depth_pro/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py new file mode 100644 index 000000000000..3d37965dcd1b --- /dev/null +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -0,0 +1,335 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch DepthPro model.""" + +import unittest + +from transformers import DepthProConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DepthProForDepthEstimation, DepthProModel + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import DepthProImageProcessor + + +class DepthProModelTester: + def __init__( + self, + parent, + batch_size=8, + image_size=64, + patch_size=8, + patch_embeddings_size=4, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + fusion_hidden_size=16, + intermediate_hook_ids=[1, 0], + intermediate_feature_dims=[8, 8], + scaled_images_ratios=[0.5, 1.0], + scaled_images_overlap_ratios=[0.0, 0.2], + scaled_images_feature_dims=[12, 12], + num_hidden_layers=2, + num_attention_heads=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + use_fov_model=True, + num_labels=3, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.patch_embeddings_size = patch_embeddings_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.fusion_hidden_size = fusion_hidden_size + self.intermediate_hook_ids = intermediate_hook_ids + self.intermediate_feature_dims = intermediate_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.use_fov_model = use_fov_model + self.num_labels = 
num_labels + + self.num_patches = (patch_size // patch_embeddings_size) ** 2 + self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DepthProConfig( + image_size=self.image_size, + patch_size=self.patch_size, + patch_embeddings_size=self.patch_embeddings_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + fusion_hidden_size=self.fusion_hidden_size, + intermediate_hook_ids=self.intermediate_hook_ids, + intermediate_feature_dims=self.intermediate_feature_dims, + scaled_images_ratios=self.scaled_images_ratios, + scaled_images_overlap_ratios=self.scaled_images_overlap_ratios, + scaled_images_feature_dims=self.scaled_images_feature_dims, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + use_fov_model=self.use_fov_model, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DepthProModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DepthProForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DepthProModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DepthPro does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (DepthProModel, DepthProForDepthEstimation) if is_torch_available() else () + pipeline_model_mapping = ( + { + "depth-estimation": DepthProForDepthEstimation, + "image-feature-extraction": DepthProModel, + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DepthProModelTester(self) + self.config_tester = ConfigTester(self, config_class=DepthProConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DepthPro does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + # Skip the check for the backbone + backbone_params = [] + for name, module in model.named_modules(): + if module.__class__.__name__ == "DepthProViTHybridEmbeddings": + 
backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] + break + + for name, param in model.named_parameters(): + if param.requires_grad: + if name in backbone_params: + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @slow + def test_model_from_pretrained(self): + model_name = "Intel/depth_pro-large" + model = DepthProModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DepthProModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") + model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 384)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) + + def test_post_processing_depth_estimation(self): + image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") + model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large") + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] + expected_shape = torch.Size((384, 384)) + self.assertTrue(predicted_depth.shape == expected_shape) + + predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) + predicted_depth_l = predicted_depth_l[0]["predicted_depth"] + expected_shape = torch.Size((500, 500)) + self.assertTrue(predicted_depth_l.shape == expected_shape) + + output_enlarged = torch.nn.functional.interpolate( + predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False + ).squeeze() + self.assertTrue(output_enlarged.shape == expected_shape) + self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3)) From 16a39178307e3d2b484fb0df44e3ff05e0b67aff Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 02:20:22 +0500 Subject: [PATCH 043/151] more fixes --- docs/source/en/model_doc/depth_pro.md | 19 +++++++------------ .../depth_pro/configuration_depth_pro.py | 10 ---------- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 6472cc506dae..7e4ac13f1d64 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -91,17 +91,17 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DepthProConfig -## DepthProFeatureExtractor - 
-[[autodoc]] DepthProFeatureExtractor - - __call__ - - post_process_semantic_segmentation - ## DepthProImageProcessor [[autodoc]] DepthProImageProcessor - preprocess - - post_process_semantic_segmentation + - post_process_depth_estimation + +## DepthProImageProcessorFast + +[[autodoc]] DepthProImageProcessorFast + - preprocess + - post_process_depth_estimation ## DepthProModel @@ -112,8 +112,3 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DepthProForDepthEstimation - forward - -## DepthProForSemanticSegmentation - -[[autodoc]] DepthProForSemanticSegmentation - - forward diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index beb3215d8ddf..46220a0731e6 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -72,12 +72,6 @@ class DepthProConfig(PretrainedConfig): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): Whether to use the SwiGLU feedforward neural network. - apply_layernorm (`bool`, *optional*, defaults to `True`): - Whether to apply layer normalization to the feature maps in case the model is used as backbone. - reshape_hidden_states (`bool`, *optional*, defaults to `True`): - Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in - case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, - seq_len, hidden_size)`. intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`): Indices of the intermediate hidden states from the patch encoder to use for fusion. 
intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`): @@ -134,8 +128,6 @@ def __init__( layerscale_value=1.0, drop_path_rate=0.0, use_swiglu_ffn=False, - apply_layernorm=True, - reshape_hidden_states=True, intermediate_hook_ids=[11, 5], intermediate_feature_dims=[256, 256], scaled_images_ratios=[0.25, 0.5, 1], @@ -167,8 +159,6 @@ def __init__( self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate self.use_swiglu_ffn = use_swiglu_ffn - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual self.use_bias_in_fusion_residual = use_bias_in_fusion_residual self.use_fov_model = use_fov_model diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 2e074588d4e3..27754c5dbafc 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -22,16 +22,16 @@ from torch import nn from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( + ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, torch_int, - ModelOutput, ) from .configuration_depth_pro import DepthProConfig From 2408ec54e4f27d2abbecdb8374e58f34d91d8e96 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 12:18:09 +0500 Subject: [PATCH 044/151] use output states from image_encoder instead --- .../models/depth_pro/modeling_depth_pro.py | 49 ++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 27754c5dbafc..00241bb86465 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -103,7 +103,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = self.position_embeddings.shape[1] - 1 + num_positions = embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +117,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - sqrt_num_positions = torch_int(num_positions**0.5) - patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) + patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype @@ -734,6 +734,7 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.fusion_hidden_size = config.fusion_hidden_size + self.patch_size = 
config.patch_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -867,7 +868,7 @@ def forward( patch_encodings = self.patch_encoder( patches, head_mask=head_mask, - output_attentions=output_attentions, + output_attentions=False, output_hidden_states=True, # required for intermediate features return_dict=True, ) @@ -875,11 +876,18 @@ def forward( patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first + # scale the image to patch size for image_encoder + scaled_image_to_patch_size = nn.functional.interpolate( + pixel_values, + size=(self.patch_size, self.patch_size), + mode="bilinear", + align_corners=False, + ) image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=scaled_image_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, return_dict=True, ) @@ -946,19 +954,15 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + ) # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (B, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = merge( - image_features, - batch_size=B, - merge_out_size=self.out_size * 2 ** (0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + # no merge required for image_features as they are already in batches instead of patches # d. upsample image_features = self.upsample_image( @@ -980,20 +984,9 @@ def forward( # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = patch_encodings.last_hidden_state - hidden_states = patch_encodings.hidden_states if output_hidden_states else None - attentions = patch_encodings.attentions if output_attentions else None - - num_patches = sum(scaled_images_num_patches) - # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 - indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T - indexes = indexes.to(last_hidden_state.device) - - last_hidden_state = last_hidden_state[indexes].mean(1) - if hidden_states is not None: - hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) - if attentions is not None: - attentions = tuple([state[indexes].mean(1) for state in attentions]) + last_hidden_state = image_encodings.last_hidden_state + hidden_states = image_encodings.hidden_states if output_hidden_states else None + attentions = image_encodings.attentions if output_attentions else None if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) From be0c2a37478589c31d5b3864f16b955f952b43cd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 09:13:25 +0500 Subject: [PATCH 045/151] Revert "use output states from image_encoder instead" This reverts commit 2408ec54e4f27d2abbecdb8374e58f34d91d8e96. 
--- .../models/depth_pro/modeling_depth_pro.py | 49 +++++++++++-------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 00241bb86465..27754c5dbafc 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -103,7 +103,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +117,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) - patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype @@ -734,7 +734,6 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.fusion_hidden_size = config.fusion_hidden_size - self.patch_size = config.patch_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -868,7 +867,7 @@ def forward( patch_encodings = self.patch_encoder( patches, head_mask=head_mask, - output_attentions=False, + output_attentions=output_attentions, output_hidden_states=True, # required for intermediate features return_dict=True, ) @@ -876,18 +875,11 @@ def forward( patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first - # scale the image to patch size for image_encoder - scaled_image_to_patch_size = nn.functional.interpolate( - pixel_values, - size=(self.patch_size, self.patch_size), - mode="bilinear", - align_corners=False, - ) image_encodings = self.image_encoder( - pixel_values=scaled_image_to_patch_size, + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, return_dict=True, ) @@ -954,15 +946,19 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (B, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together - # no merge required for image_features as they are already in batches instead of patches + image_features = merge( + image_features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample image_features = self.upsample_image( @@ -984,9 +980,20 @@ def forward( # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = image_encodings.last_hidden_state - hidden_states = image_encodings.hidden_states if output_hidden_states else None - attentions = image_encodings.attentions if output_attentions else None + last_hidden_state = patch_encodings.last_hidden_state + hidden_states = patch_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions if output_attentions else None + + num_patches = sum(scaled_images_num_patches) + # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 + indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T + indexes = indexes.to(last_hidden_state.device) + + last_hidden_state = last_hidden_state[indexes].mean(1) + if hidden_states is not None: + hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) + if attentions is not None: + attentions = tuple([state[indexes].mean(1) for state in attentions]) if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) From efed39f86e629a56df892f45dcbb5d4dc05222a4 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 09:18:16 +0500 Subject: [PATCH 046/151] make embeddings dynamic --- src/transformers/models/depth_pro/modeling_depth_pro.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 27754c5dbafc..4f97f37230cb 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -80,6 +80,7 @@ class DepthProViTEmbeddings(nn.Module): """ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Embeddings except antialias=True in interpolation and removal of mask_token + and enabling dynamic embeddings. 
""" def __init__(self, config: DepthProConfig) -> None: @@ -103,7 +104,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = self.position_embeddings.shape[1] - 1 + num_positions = embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +118,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - sqrt_num_positions = torch_int(num_positions**0.5) - patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) + patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype From c3b14fbcc54a1877bf6ebb7b7b61d9d67f1753ce Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 10:58:45 +0500 Subject: [PATCH 047/151] reshape output hidden states and attentions as part of computation graph --- .../models/depth_pro/modeling_depth_pro.py | 114 +++++++++++++----- .../depth_pro/test_modeling_depth_pro.py | 3 +- 2 files changed, 88 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 4f97f37230cb..6f20838375cf 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -42,6 +42,25 @@ _CONFIG_FOR_DOC = "DepthProConfig" +def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor: + """ + converts tensor from shape: + (num_patches, seq_len, hidden_size) -> (batch_size, num_patches_per_batch, seq_len, hidden_size) + """ + data = data.reshape(-1, batch_size, *data.shape[1:]) + data = data.transpose(0, 1) + return data + +def batch_to_patch(data: torch.Tensor) -> torch.Tensor: + """ + converts tensor from shape: + (batch_size, num_patches_per_batch, seq_len, hidden_size) -> (num_patches, seq_len, hidden_size) + """ + data = data.transpose(0, 1) + data = data.reshape(-1, *data.shape[2:]) + return data + + class DepthProViTPatchEmbeddings(nn.Module): """ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings @@ -135,13 +154,17 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: return torch.cat((class_pos_embed, patch_pos_embed), dim=1) - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - batch_size, _, height, width = pixel_values.shape + def forward( + self, + pixel_values: torch.Tensor, + batch_size: Optional[int] = None, + ) -> torch.Tensor: + n, _, height, width = pixel_values.shape target_dtype = self.patch_embeddings.projection.weight.dtype embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) # add the [CLS] token to the embedded patch tokens - cls_tokens = self.cls_token.expand(batch_size, -1, -1) + cls_tokens = self.cls_token.expand(n, -1, -1) embeddings = torch.cat((cls_tokens, embeddings), dim=1) # add positional encoding to each token @@ -149,11 +172,14 @@ def forward(self, pixel_values: 
torch.Tensor) -> torch.Tensor: embeddings = self.dropout(embeddings) + if batch_size is not None: + embeddings = patch_to_batch(embeddings, batch_size) + return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthPro class DepthProViTSelfAttention(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention.__init__ with ViT->DepthPro def __init__(self, config: DepthProConfig) -> None: super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -172,13 +198,20 @@ def __init__(self, config: DepthProConfig) -> None: self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention.transpose_for_scores with ViT->DepthPro def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(new_x_shape) return x.permute(0, 2, 1, 3) + # Taken from transformers.models.vit.modeling_vit.ViTSelfAttention.forward with ViT->DepthPro + # with the addition of `batch_size` def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: mixed_query_layer = self.query(hidden_states) @@ -202,25 +235,37 @@ def forward( if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = torch.matmul(attention_probs, value_layer) + if batch_size is not None: + attention_probs_batched = patch_to_batch(attention_probs, batch_size) + attention_probs_patched = batch_to_patch(attention_probs_batched) + else: + attention_probs_patched = attention_probs_batched = attention_probs + + context_layer = torch.matmul(attention_probs_patched, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(new_context_layer_shape) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + outputs = (context_layer, attention_probs_batched) if output_attentions else (context_layer,) return outputs -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention.__init__ with Dinov2Config->DepthProConfig, Dinov2->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__(config) self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + # Taken from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention.forward with Dinov2Config->DepthProConfig, Dinov2->DepthProViT + # with the addition of `batch_size` def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
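For reference, the `patch_to_batch` and `batch_to_patch` helpers introduced in this commit are pure reshapes and exact inverses of each other. A minimal sketch of the round trip with dummy sizes (the numbers below are made up for illustration and are not tied to any DepthPro checkpoint):

```python
import torch

# Hypothetical sizes: 6 patches cut from a batch of 3 images, 5 tokens each, hidden size 4.
num_patches, batch_size, seq_len, hidden_size = 6, 3, 5, 4
data = torch.randn(num_patches, seq_len, hidden_size)

# patch_to_batch: (num_patches, seq_len, hidden) -> (batch_size, num_patches_per_batch, seq_len, hidden)
batched = data.reshape(-1, batch_size, *data.shape[1:]).transpose(0, 1)
assert batched.shape == (batch_size, num_patches // batch_size, seq_len, hidden_size)

# batch_to_patch undoes the transform, recovering the original patch-major layout.
restored = batched.transpose(0, 1).reshape(-1, *batched.shape[2:])
assert torch.equal(restored, data)
```

This is why the encoder can expose batched `hidden_states` and `attentions` as part of the computation graph while the attention math itself still runs over the flat patch dimension.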
@@ -229,7 +274,7 @@ def forward( 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' ) return super().forward( - hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions, batch_size=batch_size, ) mixed_query_layer = self.query(hidden_states) @@ -274,14 +319,15 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTAttention(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTAttention.__init__ with ViTConfig->DepthProConfig, ViT->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__() self.attention = DepthProViTSelfAttention(config) self.output = DepthProViTSelfOutput(config) self.pruned_heads = set() + # Copied from transformers.models.vit.modeling_vit.ViTAttention.prune_heads def prune_heads(self, heads: Set[int]) -> None: if len(heads) == 0: return @@ -300,13 +346,16 @@ def prune_heads(self, heads: Set[int]) -> None: self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) + # Taken from transformers.models.vit.modeling_vit.ViTAttention.prune_heads + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - self_outputs = self.attention(hidden_states, head_mask, output_attentions) + self_outputs = self.attention(hidden_states, head_mask, output_attentions, batch_size) attention_output = self.output(self_outputs[0], hidden_states) @@ -411,10 +460,10 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: } -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing class DepthProViTLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer.__init__ with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -431,16 +480,23 @@ def __init__(self, config: DepthProConfig) -> None: self.mlp = DepthProViTMLP(config) self.layer_scale2 = DepthProViTLayerScale(config) + # Taken from transformers.models.dinov2.modeling_dinov2.Dinov2Layer.forward + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if batch_size is not None: + hidden_states = batch_to_patch(hidden_states) + self_attention_outputs = self.attention( self.norm1(hidden_states), # in DepthProViT, layernorm is applied before self-attention head_mask, output_attentions=output_attentions, + batch_size=batch_size, ) attention_output = self_attention_outputs[0] @@ -458,19 +514,24 @@ def forward( # second residual connection layer_output = self.drop_path(layer_output) + hidden_states + 
if batch_size is not None: + layer_output = patch_to_batch(layer_output, batch_size) + outputs = (layer_output,) + outputs return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTEncoder(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTEncoder.__init__ with ViTConfig->DepthProConfig, ViT->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.layer = nn.ModuleList([DepthProViTLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + # Taken from transformers.models.vit.modeling_vit.ViTEncoder.__init__ + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, @@ -478,6 +539,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, + batch_size: Optional[int] = None, ) -> Union[tuple, BaseModelOutput]: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -494,9 +556,10 @@ def forward( hidden_states, layer_head_mask, output_attentions, + batch_size, ) else: - layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, batch_size) hidden_states = layer_outputs[0] @@ -532,6 +595,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + batch_size: Optional[int] = None, ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -542,7 +606,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - embedding_output = self.embeddings(pixel_values) + embedding_output = self.embeddings(pixel_values, batch_size=batch_size) encoder_outputs = self.encoder( embedding_output, @@ -550,6 +614,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + batch_size=batch_size, ) sequence_output = encoder_outputs[0] sequence_output = self.layernorm(sequence_output) @@ -871,9 +936,12 @@ def forward( output_attentions=output_attentions, output_hidden_states=True, # required for intermediate features return_dict=True, + batch_size=B, ) + last_hidden_state = patch_encodings.last_hidden_state + last_hidden_state = batch_to_patch(last_hidden_state) scaled_images_last_hidden_state = torch.split_with_sizes( - patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] + last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( @@ -917,6 +985,7 @@ def forward( self.intermediate_hook_ids[i] + 1 ) # +1 to correct index position as hidden_states contain embedding output as well hidden_state = patch_encodings.hidden_states[layer_id] + hidden_state = batch_to_patch(hidden_state) hidden_state = hidden_state[ : scaled_images_num_patches[-1] ] # num_patches to be of same length as highest resolution @@ -985,17 +1054,6 @@ def forward( hidden_states = patch_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions if output_attentions else None - num_patches = sum(scaled_images_num_patches) - # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 - 
indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T - indexes = indexes.to(last_hidden_state.device) - - last_hidden_state = last_hidden_state[indexes].mean(1) - if hidden_states is not None: - hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) - if attentions is not None: - attentions = tuple([state[indexes].mean(1) for state in attentions]) - if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 3d37965dcd1b..9e881cf273b7 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -131,7 +131,8 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size)) def create_and_check_for_depth_estimation(self, config, pixel_values, labels): config.num_labels = self.num_labels From 7cf2485adef235b906b469a38002a8dacc3d0537 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 11:14:21 +0500 Subject: [PATCH 048/151] fix ruff formating --- .../depth_pro/image_processing_depth_pro.py | 18 ++++++---- .../models/depth_pro/modeling_depth_pro.py | 36 +++++++++++-------- .../depth_pro/test_modeling_depth_pro.py | 8 +++-- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 65a29900c637..164c7e28c6e2 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -157,13 +157,17 @@ def resize( raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - return torch.nn.functional.interpolate( - # input should be (B, C, H, W) - input=torch.from_numpy(image).unsqueeze(0), - size=output_size, - mode=pil_torch_interpolation_mapping[resample].value, - antialias=antialias, - ).squeeze(0).numpy() + return ( + torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=torch.from_numpy(image).unsqueeze(0), + size=output_size, + mode=pil_torch_interpolation_mapping[resample].value, + antialias=antialias, + ) + .squeeze(0) + .numpy() + ) def _validate_input_arguments( self, diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 6f20838375cf..8fa286c70919 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -51,6 +51,7 @@ def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor: data = data.transpose(0, 1) return data + def batch_to_patch(data: torch.Tensor) -> torch.Tensor: """ converts tensor from shape: @@ -155,10 +156,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward( - self, - pixel_values: torch.Tensor, - batch_size: Optional[int] = None, - ) -> torch.Tensor: + self, + pixel_values: torch.Tensor, + batch_size: Optional[int] = None, + ) -> torch.Tensor: n, _, height, width = pixel_values.shape target_dtype = self.patch_embeddings.projection.weight.dtype embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) @@ -274,7 +275,10 @@ def forward( 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( - hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions, batch_size=batch_size, + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + batch_size=batch_size, ) mixed_query_layer = self.query(hidden_states) @@ -940,9 +944,9 @@ def forward( ) last_hidden_state = patch_encodings.last_hidden_state last_hidden_state = batch_to_patch(last_hidden_state) - scaled_images_last_hidden_state = torch.split_with_sizes( - last_hidden_state, scaled_images_num_patches[::-1] - )[::-1] # -1 as patch encoder expects high res patches first + scaled_images_last_hidden_state = torch.split_with_sizes(last_hidden_state, scaled_images_num_patches[::-1]) + scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] + # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( pixel_values=scaled_images[0], # provide least resolution image @@ -1610,12 +1614,16 @@ def forward( fused_features = self.fusion_stage(features) predicted_depth = self.head(fused_features) - fov = self.fov_model( - pixel_values=pixel_values, - # use lowest scaled image features for fov model - global_features=features[0].detach(), - head_mask=head_mask, - ) if self.use_fov_model else None + fov = ( + self.fov_model( + pixel_values=pixel_values, + # use lowest scaled image features for fov model + global_features=features[0].detach(), + head_mask=head_mask, + ) + if self.use_fov_model + else None + ) if not return_dict: outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 9e881cf273b7..e350b067a118 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -91,7 +91,7 @@ def __init__( self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 - self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -131,8 +131,10 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size)) + num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size) + ) def create_and_check_for_depth_estimation(self, config, pixel_values, labels): config.num_labels = self.num_labels From 0aa451df3e6862291d2097d5a1e6aa5e9aa91f23 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 22:41:11 +0500 Subject: [PATCH 049/151] fix docstring failure --- .../models/depth_pro/modeling_depth_pro.py | 16 +++++++++++++++- utils/check_docstrings.py | 1 - utils/check_repo.py | 1 - 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 
8fa286c70919..1498ce4003d3 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1496,11 +1496,25 @@ class DepthProDepthEstimatorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None +DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + use_fov_model (`bool`, *optional*, defaults to `True`): + Whether to use `DepthProFOVModel` to generate the field of view. +""" + + @add_start_docstrings( """ DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). """, - DEPTH_PRO_START_DOCSTRING, + DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING, ) class DepthProForDepthEstimation(DepthProPreTrainedModel): def __init__(self, config, use_fov_model=None): diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 34deed0df47e..0be960f4a33e 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -140,7 +140,6 @@ "DPRReaderTokenizer", "DPRReaderTokenizerFast", "DPTModel", - "DepthProModel", "Data2VecAudioConfig", "Data2VecTextConfig", "Data2VecTextModel", diff --git a/utils/check_repo.py b/utils/check_repo.py index 2e131e879153..10be5cdcd262 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -213,7 +213,6 @@ "JukeboxPrior", "SamModel", "DPTForDepthEstimation", - "DepthProForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", "ViltForImagesAndTextClassification", From 160afbf57789906a134000a5b6ee99982cf4ae6f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 23:32:39 +0500 Subject: [PATCH 050/151] use num_fov_head_layers in tests --- tests/models/depth_pro/test_modeling_depth_pro.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index e350b067a118..03f69e8ad1fe 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -64,6 +64,7 @@ def __init__( attention_probs_dropout_prob=0.1, initializer_range=0.02, use_fov_model=True, + num_fov_head_layers=0, num_labels=3, ): self.parent = parent @@ -88,6 +89,7 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 @@ -124,6 +126,7 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, initializer_range=self.initializer_range, use_fov_model=self.use_fov_model, + num_fov_head_layers=self.num_fov_head_layers, ) def create_and_check_model(self, config, pixel_values, labels): From 9d2be2603d9a75346526b2a37711c6edc40125c8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:30:08 +0500 Subject: [PATCH 051/151] update doc --- docs/source/en/model_doc/depth_pro.md | 
37 +++++++++++++++++---------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 7e4ac13f1d64..041c4d49dffc 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -26,7 +26,7 @@ The abstract from the paper is the following: *We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* -drawing DepthPro architecture. Taken from the original paper. @@ -38,16 +38,26 @@ This model was contributed by [geetu040](https://github.com/geetu040). The origi ## Usage tips ```python -from transformers import Dinov2Config, DepthProConfig, DepthProForDepthEstimation +from transformers import DepthProConfig, DepthProForDepthEstimation -# initialize with a Transformer-based backbone such as DINOv2 -# in that case, we also specify `reshape_hidden_states=False` to get feature maps of shape (batch_size, num_channels, height, width) -backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False) +config = DepthProConfig() +model = DPTForDepthEstimation(config=config) +``` + +- By default model takes an input image of size `1536`, this can be changed via config, however the model is compatible with images of different width and height. +- Input image is scaled with different ratios, as specified in `scaled_images_ratios`, then each of the scaled image is patched to `patch_size` with an overlap ratio of `scaled_images_overlap_ratios`. +- These patches go through `DinoV2 (ViT)` based encoders and are reassembled via a `DPT` based decoder. +- `DepthProForDepthEstimation` can also predict the `FOV (Field of View)` if `use_fov_model` is set to `True` in the config. +- `DepthProImageProcessor` can be used for preprocessing the inputs and postprocessing the outputs. `DepthProImageProcessor.post_process_depth_estimation` interpolates the `predicted_depth` back to match the input image size. +- To generate `predicted_depth` of the same size as input image, make sure the config is created such that +``` +image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size -config = DepthProConfig(backbone_config=backbone_config) -model = DepthProForDepthEstimation(config=config) +where +n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) ``` + ### Using Scaled Dot Product Attention (SDPA) PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function @@ -59,9 +69,9 @@ page for more information. 
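For orientation, the operator referred to above can also be exercised directly; a minimal, self-contained sketch with arbitrary tensor shapes (unrelated to DepthPro's internals):

```python
import torch
import torch.nn.functional as F

# Arbitrary shapes: batch 1, 4 heads, 16 tokens, head dimension 8.
query = key = value = torch.randn(1, 4, 16, 8)

# PyTorch dispatches to the most efficient kernel available for the given inputs and hardware.
output = F.scaled_dot_product_attention(query, key, value)
print(output.shape)  # torch.Size([1, 4, 16, 8])
```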
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` -from transformers import ViTForImageClassification -model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +```py +from transformers import DepthProForDepthEstimation +model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", attn_implementation="sdpa", torch_dtype=torch.float16) ... ``` @@ -78,12 +88,11 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` ## Resources -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro. +- Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) -- Demo notebooks for [`DepthProForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DepthPro). +- Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) -- [Semantic segmentation task guide](../tasks/semantic_segmentation) -- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) + If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. From e208459cebe6b8f821aa14e0d9e7735466751daf Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:38:32 +0500 Subject: [PATCH 052/151] check consistency with config --- .../models/depth_pro/modeling_depth_pro.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 1498ce4003d3..605ea38ea736 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -838,6 +838,23 @@ def __init__(self, config: DepthProConfig) -> None: f"by patch_embeddings_size={config.patch_embeddings_size}." ) + # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims are consistent + if not (len(config.scaled_images_ratios) == len(config.scaled_images_overlap_ratios) == len(config.scaled_images_feature_dims)): + raise ValueError( + f"len(scaled_images_ratios)={len(config.scaled_images_ratios)} and " + f"len(scaled_images_overlap_ratios)={len(config.scaled_images_overlap_ratios)} and " + f"len(scaled_images_feature_dims)={len(config.scaled_images_feature_dims)}, " + f"should match in config." + ) + + # intermediate_hook_ids, intermediate_feature_dims are consistent + if not (len(config.intermediate_hook_ids) == len(config.intermediate_feature_dims)): + raise ValueError( + f"len(intermediate_hook_ids)={len(config.intermediate_hook_ids)} and " + f"len(intermediate_feature_dims)={len(config.intermediate_feature_dims)}, " + f"should match in config." 
+ ) + # patch encoder self.patch_encoder = DepthProViT(config) From 0415722bd6dd44f4b7d56d0cacf8cdd3f958cb41 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:42:31 +0500 Subject: [PATCH 053/151] ruff formatting --- src/transformers/models/depth_pro/modeling_depth_pro.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 605ea38ea736..040b9eb07962 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -839,7 +839,11 @@ def __init__(self, config: DepthProConfig) -> None: ) # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims are consistent - if not (len(config.scaled_images_ratios) == len(config.scaled_images_overlap_ratios) == len(config.scaled_images_feature_dims)): + if not ( + len(config.scaled_images_ratios) + == len(config.scaled_images_overlap_ratios) + == len(config.scaled_images_feature_dims) + ): raise ValueError( f"len(scaled_images_ratios)={len(config.scaled_images_ratios)} and " f"len(scaled_images_overlap_ratios)={len(config.scaled_images_overlap_ratios)} and " From f4e7404191244a86a91d5e93c3be82ffa7d6b970 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 10:57:52 +0500 Subject: [PATCH 054/151] update test case --- tests/models/depth_pro/test_modeling_depth_pro.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 03f69e8ad1fe..54c5e870a258 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -52,12 +52,12 @@ def __init__( use_labels=True, hidden_size=32, fusion_hidden_size=16, - intermediate_hook_ids=[1, 0], - intermediate_feature_dims=[8, 8], + intermediate_hook_ids=[0], + intermediate_feature_dims=[8], scaled_images_ratios=[0.5, 1.0], scaled_images_overlap_ratios=[0.0, 0.2], scaled_images_feature_dims=[12, 12], - num_hidden_layers=2, + num_hidden_layers=1, num_attention_heads=4, hidden_act="gelu", hidden_dropout_prob=0.1, @@ -95,6 +95,9 @@ def __init__( self.num_patches = (patch_size // patch_embeddings_size) ** 2 self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) + self.expected_depth_size = 2**(n_fusion_blocks+1) * patch_size / patch_embeddings_size + def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -145,7 +148,7 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 2c1cc10ee8ddefce3649dac81144e5095ee00ba8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 14:55:06 +0500 Subject: [PATCH 055/151] fix ruff formatting --- tests/models/depth_pro/test_modeling_depth_pro.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 54c5e870a258..215756d45e99 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -96,7 +96,7 @@ def __init__( self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - self.expected_depth_size = 2**(n_fusion_blocks+1) * patch_size / patch_embeddings_size + self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size / patch_embeddings_size def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -148,7 +148,9 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size)) + self.parent.assertEqual( + result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size) + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 871b80db318a8e8b2b70533acd62cbcec678cc74 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 10:42:02 +0500 Subject: [PATCH 056/151] add tests for fov --- .../depth_pro/test_modeling_depth_pro.py | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 215756d45e99..48983c9aca3a 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -63,8 +63,7 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, initializer_range=0.02, - use_fov_model=True, - num_fov_head_layers=0, + use_fov_model=False, num_labels=3, ): self.parent = parent @@ -89,7 +88,6 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.use_fov_model = use_fov_model - self.num_fov_head_layers = num_fov_head_layers self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 @@ -129,7 +127,6 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, initializer_range=self.initializer_range, use_fov_model=self.use_fov_model, - num_fov_head_layers=self.num_fov_head_layers, ) def create_and_check_model(self, config, pixel_values, labels): @@ -152,6 +149,36 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size) ) + def create_and_check_for_fov(self, config, pixel_values, labels): + model = DepthProForDepthEstimation(config, use_fov_model=True) + model.to(torch_device) + model.eval() + + # check if the fov_model (DinoV2-based encoder) is created + self.parent.assertIsNotNone(model.fov_model) + + batched_pixel_values = pixel_values + row_pixel_values = pixel_values[:1] + + with torch.no_grad(): + model_batched_output_fov = model(batched_pixel_values).fov + model_row_output_fov = model(row_pixel_values).fov + + # check if fov is returned + self.parent.assertIsNotNone(model_batched_output_fov) + self.parent.assertIsNotNone(model_row_output_fov) + + # check output shape 
consistency for fov + self.parent.assertEqual(model_batched_output_fov.shape, (self.batch_size,)) + + # check equivalence between batched and single row outputs for fov + diff = torch.max(torch.abs(model_row_output_fov - model_batched_output_fov[:1])) + model_name = model.__class__.__name__ + self.parent.assertTrue( + diff <= 1e-03, + msg=(f"Batched and Single row outputs are not equal in {model_name} for fov. " f"Difference={diff}."), + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values, labels = config_and_inputs @@ -208,6 +235,10 @@ def test_for_depth_estimation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + def test_for_fov(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_fov(*config_and_inputs) + def test_training(self): for model_class in self.all_model_classes: if model_class.__name__ == "DepthProForDepthEstimation": From 0ff06556163a39f90eede4d5e889554e46b9de46 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 15:11:06 +0500 Subject: [PATCH 057/151] use interpolation in postprocess --- .../models/depth_pro/image_processing_depth_pro.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 164c7e28c6e2..228c3d992457 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -393,10 +393,11 @@ def post_process_depth_estimation( outputs["fov"].append(fov) # interpolate - predicted_depth = self.resize( - predicted_depth.unsqueeze(0).unsqueeze(1), + predicted_depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=predicted_depth.unsqueeze(0).unsqueeze(1), size=target_size, - resample=self.resample, + mode=pil_torch_interpolation_mapping[self.resample].value, antialias=self.antialias, ).squeeze() From befa6cdbca6194a4fab82c9865bfb9deeebe54c7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 15:26:50 +0500 Subject: [PATCH 058/151] run and fix slow tests locally --- .../depth_pro/test_modeling_depth_pro.py | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 48983c9aca3a..a3026801d593 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -94,7 +94,7 @@ def __init__( self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size / patch_embeddings_size + self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size // patch_embeddings_size def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -313,8 +313,8 @@ def test_initialization(self): @slow def test_model_from_pretrained(self): - model_name = "Intel/depth_pro-large" - model = DepthProModel.from_pretrained(model_name) + model_path = "geetu040/DepthPro" + model = DepthProModel.from_pretrained(model_path) self.assertIsNotNone(model) @@ 
-329,8 +329,10 @@ def prepare_img(): @slow class DepthProModelIntegrationTest(unittest.TestCase): def test_inference_depth_estimation(self): - image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") - model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large").to(torch_device) + model_path = "geetu040/DepthPro" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device) + config = model.config image = prepare_img() inputs = image_processor(images=image, return_tensors="pt").to(torch_device) @@ -341,18 +343,21 @@ def test_inference_depth_estimation(self): predicted_depth = outputs.predicted_depth # verify the predicted depth - expected_shape = torch.Size((1, 384, 384)) + n_fusion_blocks = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + expected_depth_size = 2 ** (n_fusion_blocks + 1) * config.patch_size // config.patch_embeddings_size + expected_shape = torch.Size((1, expected_depth_size, expected_depth_size)) self.assertEqual(predicted_depth.shape, expected_shape) expected_slice = torch.tensor( - [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + [[1.0582, 1.1225, 1.1335], [1.1154, 1.1398, 1.1486], [1.1434, 1.1500, 1.1643]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) def test_post_processing_depth_estimation(self): - image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") - model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large") + model_path = "geetu040/DepthPro" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path) image = prepare_img() inputs = image_processor(images=image, return_tensors="pt") @@ -361,17 +366,15 @@ def test_post_processing_depth_estimation(self): with torch.no_grad(): outputs = model(**inputs) - predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] - expected_shape = torch.Size((384, 384)) - self.assertTrue(predicted_depth.shape == expected_shape) - - predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) - predicted_depth_l = predicted_depth_l[0]["predicted_depth"] - expected_shape = torch.Size((500, 500)) - self.assertTrue(predicted_depth_l.shape == expected_shape) + predicted_depth = outputs.predicted_depth + fov = outputs.fov + target_size = [[image.height, image.width]] * len(predicted_depth) - output_enlarged = torch.nn.functional.interpolate( - predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False - ).squeeze() - self.assertTrue(output_enlarged.shape == expected_shape) - self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3)) + outputs = image_processor.post_process_depth_estimation( + predicted_depths=predicted_depth, + fovs=fov, + target_sizes=target_size, + ) + predicted_depth = outputs["predicted_depth"][0] + expected_shape = torch.Size((image.height, image.width)) + self.assertTrue(predicted_depth.shape == expected_shape) From 99ac5e81cc98b9297a81af784bf227179f1609e3 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 19:53:22 +0500 Subject: [PATCH 059/151] use scaled_images_features for image and fov encoder --- .../models/depth_pro/modeling_depth_pro.py | 80 ++++++++++--------- 1 file changed, 43 
insertions(+), 37 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 040b9eb07962..f77e24925c88 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -959,7 +959,8 @@ def forward( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=True, # required for intermediate features + # required for intermediate features + output_hidden_states=self.n_intermediate_hooks or output_hidden_states, return_dict=True, batch_size=B, ) @@ -969,12 +970,16 @@ def forward( scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] # -1 as patch encoder expects high res patches first + # scale the image to patch size for image_encoder + image_scaled_to_patch_size = nn.functional.interpolate( + pixel_values, + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, + ) image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=image_scaled_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, - return_dict=True, ) # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram @@ -1041,19 +1046,15 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + ) # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (B, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = merge( - image_features, - batch_size=B, - merge_out_size=self.out_size * 2 ** (0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + # no merge required for image_features as they are already in batches instead of patches # d. 
upsample image_features = self.upsample_image( @@ -1073,8 +1074,6 @@ def forward( *intermediate_features, ] - # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = patch_encodings.last_hidden_state hidden_states = patch_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions if output_attentions else None @@ -1420,35 +1419,42 @@ def forward( B, C, W, H = pixel_values.shape # follow the steps same as with image features in DepthProEncoder - pixel_values = interpolate( - pixel_values, - scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image - ) - patches = patch( + # except for the extra encoder_neck layer applied + + image_scaled_to_patch_size = nn.functional.interpolate( pixel_values, - patch_size=self.config.patch_size, - overlap_ratio=self.config.scaled_images_overlap_ratios[0], + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, ) - encoder_outputs = self.encoder( - patches, + encodings = self.encoder( + image_scaled_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ) - last_hidden_state = encoder_outputs.last_hidden_state - last_hidden_state = self.encoder_neck(last_hidden_state) - last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) - last_hidden_state = merge( - last_hidden_state, - batch_size=B, - merge_out_size=self.out_size, ) + # a. extract hidden_state + hidden_state = ( + encodings.last_hidden_state + ) # (B, self.seq_len+1, config.hidden_size) + # extra step + hidden_state = self.encoder_neck(hidden_state) + # (B, self.fusion_hidden_size//2, self.out_size, self.out_size) + + # b. reshape back to image like + fov_features = reshape_feature( + hidden_state, self.out_size, self.out_size + ) # (B, config.hidden_size, self.out_size, self.out_size) + + # c. merge patches back together + # no merge required for fov_features as they are already in batches instead of patches + + # d. 
upsample + # no upsampling required for fov_features, the head later downsamples to create scalars + global_features = self.global_neck(global_features) - last_hidden_state = last_hidden_state + global_features - fov_output = self.head(last_hidden_state) + fov_features = fov_features + global_features + fov_output = self.head(fov_features) fov_output = fov_output.reshape(B) return fov_output @@ -1652,7 +1658,7 @@ def forward( fov = ( self.fov_model( pixel_values=pixel_values, - # use lowest scaled image features for fov model + # frozon features from encoder are used global_features=features[0].detach(), head_mask=head_mask, ) From ebb62dd2190a164d8f4cfbb218cd7c2099515ae1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:28:32 +0500 Subject: [PATCH 060/151] return fused_hidden_states in fusion stage --- .../models/depth_pro/modeling_depth_pro.py | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f77e24925c88..91758a3db485 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -765,7 +765,6 @@ def merge(patches, batch_size, merge_out_size): boxes.append(boxes_in_row) boxes = torch.cat(boxes, dim=-2) - boxes = boxes[..., :merge_out_size, :merge_out_size] return boxes @@ -1303,7 +1302,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: # Taken from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer -# except it uses deconv, skip_add and avoids interpolation (it always receives consitent inputs) +# except it uses deconv annd skip_add class DepthProFeatureFusionLayer(nn.Module): def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: super().__init__() @@ -1328,6 +1327,10 @@ def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: def forward(self, hidden_state, residual=None): if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) hidden_state = self.residual_layer2(hidden_state) @@ -1357,13 +1360,17 @@ def forward(self, hidden_states): f"doesnot match len(hidden_states)={len(hidden_states)}" ) - # first layer only uses the last hidden_state - fused_hidden_state = self.layers[0](hidden_states[0]) - # looping from the second layer to last layer - for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): - fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states = [] + fused_hidden_state = None + for hidden_state, layer in zip(hidden_states, self.layers): + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) - return fused_hidden_state + return fused_hidden_states class DepthProFOVModel(nn.Module): @@ -1652,8 +1659,8 @@ def forward( ) features = depth_pro_outputs.features features = [proj(feature) for proj, feature in zip(self.projections, features)] - fused_features = self.fusion_stage(features) - predicted_depth = self.head(fused_features) + fused_hidden_states = self.fusion_stage(features) + predicted_depth = 
self.head(fused_hidden_states[-1]) fov = ( self.fov_model( From 46c88e8bd3ba4dc2331b81fad1a54a4b902445e7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:44:44 +0500 Subject: [PATCH 061/151] fix example --- .../models/depth_pro/modeling_depth_pro.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 91758a3db485..8f1609b6fb15 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1206,14 +1206,8 @@ def forward( >>> with torch.no_grad(): ... output = model(**inputs) ... - >>> for state in output.last_hidden_state: - ... print(state.shape) - ... - torch.Size([1, 1024, 48, 48]) - torch.Size([1, 1024, 96, 96]) - torch.Size([1, 512, 192, 192]) - torch.Size([1, 256, 384, 384]) - torch.Size([1, 256, 768, 768]) + >>> output.last_hidden_state.shape + torch.Size([1, 35, 577, 1024]) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From 243135880028d09441fb41440f760a9a2c329a33 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:48:36 +0500 Subject: [PATCH 062/151] fix ruff --- src/transformers/models/depth_pro/modeling_depth_pro.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 8f1609b6fb15..bd6c811a1163 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1043,9 +1043,7 @@ def forward( # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = ( - image_encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + hidden_state = image_encodings.last_hidden_state # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( @@ -1434,9 +1432,7 @@ def forward( ) # a. 
extract hidden_state - hidden_state = ( - encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + hidden_state = encodings.last_hidden_state # (B, self.seq_len+1, config.hidden_size) # extra step hidden_state = self.encoder_neck(hidden_state) # (B, self.fusion_hidden_size//2, self.out_size, self.out_size) From d9d3a49906bab33156ab97f8ebb7b2bd87d45a49 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 21 Dec 2024 10:23:09 +0500 Subject: [PATCH 063/151] fix copyright license for all files --- docs/source/en/model_doc/depth_pro.md | 2 +- src/transformers/models/depth_pro/__init__.py | 2 +- src/transformers/models/depth_pro/configuration_depth_pro.py | 2 +- .../models/depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- src/transformers/models/depth_pro/image_processing_depth_pro.py | 2 +- .../models/depth_pro/image_processing_depth_pro_fast.py | 2 +- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- tests/models/depth_pro/test_image_processing_depth_pro.py | 2 +- tests/models/depth_pro/test_modeling_depth_pro.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 041c4d49dffc..9019547434af 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -1,4 +1,4 @@ - -## Usage tips +Here's an improved version of your documentation with enhanced clarity, formatting, and structure for easier understanding: +--- + +## **Usage Tips** + +Initialize the Model with Default Configuration ```python -from transformers import DepthProConfig, DepthProForDepthEstimation +from transformers import DepthProConfig, DepthProModel config = DepthProConfig() -model = DepthProForDepthEstimation(config=config) +model = DepthProModel(config=config) ``` -- Input image is scaled with different ratios, as specified in `scaled_images_ratios`, and each of the scaled image is patched to `patch_size` with an overlap ratio of `scaled_images_overlap_ratios`. -- These patches go through `DinoV2 (ViT)` based encoders and are reassembled via a `DPT` based decoder. -- `DepthProForDepthEstimation` can also predict the `FOV (Field of View)` if `use_fov_model` is set to `True` in the config. -- `DepthProImageProcessor` can be used for preprocessing the inputs and postprocessing the outputs. `DepthProImageProcessor.post_process_depth_estimation` interpolates the `predicted_depth` back to match the input image size. -- To generate `predicted_depth` of the same size as input image, make sure the config is created such that -``` -image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size +Load a Pre-Trained Model for Depth Estimation +```python +from transformers import DepthProConfig, DepthProForDepthEstimation -where -n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) +checkpoint = "geetu040/DepthPro" +model = DepthProForDepthEstimation.from_pretrained(checkpoint) +config = model.config ``` +Key Features and Configuration Details + +1. Dual-Encoder Architecture: + - The `DepthProModel` uses **two encoders**: + - **`image_encoder`** and **`patch_encoder`**, which can be configured via `image_model_config` and `patch_model_config` in the configuration. + - By default, and in the pre-trained model, both encoders use the **`Dinov2Model`** architecture. + +2. Image Scaling and Patch Processing: + - Input images are scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration. 
+ - Each scaled image is split into smaller **patches** of size `patch_size` with overlapping areas determined by `scaled_images_overlap_ratios`. + - These patches are processed by the **`patch_encoder`**, while the image is also rescaled to `patch_size` and is processed by the **`image_encoder`**. + - Outputs from both encoders (`last_hidden_state`) and selected intermediate states (`hidden_states`) from **`patch_encoder`** are fused by a `DPT`-based `FeatureFusionStage` for depth estimation. + +3. Optional Field of View (FOV) Prediction: + - If `use_fov_model` is set to `True` in the configuration, the model predicts the **Field of View (FOV)** using a third encoder. + - This encoder also scales the image to `patch_size` and uses its `last_hidden_state` for FOV prediction. The encoder can be specified in the configuration using `fov_model_config`. + +4. Configuration and Validation: + - All encoders receive input images of size `patch_size`. + - The `image_size` for each encoder in the configuration should match the `patch_size`. This is validated when creating a `DepthProConfig`. + +5. Preprocessing and Postprocessing: + - Use the `DepthProImageProcessor` for preparing inputs and processing outputs: + - **Preprocessing**: Prepare images (rescale, normalize, resize) for model input. + - **Postprocessing**: Use `DepthProImageProcessor.post_process_depth_estimation` to interpolate the predicted depth to match the original input image size. + +6. Support for Variable Resolution and Aspect Ratios: + - The `DepthProModel` can process images with different resolutions and aspect ratios. However, for generating predicted depths that match the input image size, ensure the configuration satisfies: + ```py + input_image_size / 2**(n_fusion_blocks + 1) == image_model_config.image_size / image_model_config.patch_size + ``` + + - **Where**: + - `input_image_size`: The size of the input image. + - `image_model_config.image_size`: Image size for **`image_encoder`** which equals to `patch_size` in `DepthProConfig`. + - `n_fusion_blocks`: Total fusion blocks, calculated as: + ```py + len(intermediate_hook_ids) + len(scaled_images_ratios) + ``` + +### **Customizing Encoders in `DepthProModel`** + +The `DepthProModel` architecture uses **three encoders**, each responsible for a specific task: + +1. **Patch Encoder**: Processes image patches created by splitting the input image. +2. **Image Encoder**: Processes the input image resized to `patch_size`. +3. **FOV (Field of View) Encoder**: Generates the Field of View (FOV), if `use_fov_model` is enabled. + +You can configure each encoder to use any compatible model architecture. For example, to use: +- **`ViT` (Vision Transformer)** as the **patch encoder**, and +- **`BEiT`** as the **image encoder**, and +- **`DinoV2`** as the **FOV encoder**. 
+ +```python +from transformers import DepthProConfig, DepthProForDepthEstimation + +config = DepthProConfig( + patch_model_config={ + "model_type": "vit", + "num_hidden_layers": 6, + "patch_size": 16, + "hidden_size": 512, + "num_attention_heads": 16, + "image_size": 384, # matches `patch_size` + }, + image_model_config={ + "model_type": "beit", + "num_hidden_layers": 4, + "patch_size": 8, + "hidden_size": 256, + "num_attention_heads": 8, + "image_size": 384, # matches `patch_size` + }, + fov_model_config={ + "model_type": "dinov2", + "num_hidden_layers": 4, + "patch_size": 8, + "hidden_size": 256, + "num_attention_heads": 8, + "image_size": 384, # matches `patch_size` + }, + patch_size=384, + # uses layers from the patch encoder + intermediate_hook_ids=[5, 1], + use_fov_model=True, +) +model = DepthProForDepthEstimation(config) +``` ### Using Scaled Dot Product Attention (SDPA) @@ -87,8 +177,10 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` ## Resources - Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) - - Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) +- DepthPro for Super Resolution and Image Segmentation + - Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba) + - Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth) From 1a2dd3af42495a250b22fead79fefca4ec283634 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 14:13:30 +0500 Subject: [PATCH 104/151] include fov in integraiton tests --- tests/models/depth_pro/test_modeling_depth_pro.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 0858c8c2a0e6..4347d507ae34 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -346,20 +346,26 @@ def test_inference_depth_estimation(self): # forward pass with torch.no_grad(): outputs = model(**inputs) - predicted_depth = outputs.predicted_depth # verify the predicted depth n_fusion_blocks = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) out_size = config.image_model_config.image_size // config.image_model_config.patch_size expected_depth_size = 2 ** (n_fusion_blocks + 1) * out_size + expected_shape = torch.Size((1, expected_depth_size, expected_depth_size)) - self.assertEqual(predicted_depth.shape, expected_shape) + self.assertEqual(outputs.predicted_depth.shape, expected_shape) expected_slice = torch.tensor( [[1.0582, 1.1225, 1.1335], [1.1154, 1.1398, 1.1486], [1.1434, 1.1500, 1.1643]] ).to(torch_device) + torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4, rtol=1e-4) + + # verify the predicted fov + expected_shape = torch.Size((1,)) + self.assertEqual(outputs.fov.shape, expected_shape) - torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4, rtol=0) + expected_slice = torch.tensor([47.2459]).to(torch_device) + torch.testing.assert_close(outputs.fov, expected_slice, atol=1e-4, rtol=1e-4) def test_post_processing_depth_estimation(self): model_path = "geetu040/DepthPro" From 4cfebaebfc34cf5b16933010f46ba51c42710c0d Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 19:29:44 +0500 Subject: [PATCH 
105/151] update docs --- docs/source/en/model_doc/depth_pro.md | 186 ++++++++++++-------------- 1 file changed, 83 insertions(+), 103 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 2b74557b41b1..20b526dda76e 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -20,133 +20,112 @@ rendered properly in your Markdown viewer. The DepthPro model was proposed in [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073) by Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, Vladlen Koltun. -It leverages a multi-scale [Vision Transformer (ViT)](vit) optimized for dense predictions. It downsamples an image at several scales. At each scale, it is split into patches, which are processed by a ViT-based [Dinov2](dinov2) patch encoder, with weights shared across scales. Patches are merged into feature maps, upsampled, and fused via a [DPT](dpt)-like decoder. +DepthPro is a foundation model for zero-shot metric monocular depth estimation, designed to generate high-resolution depth maps with remarkable sharpness and fine-grained details. It employs a multi-scale Vision Transformer (ViT)-based architecture, where images are downsampled, divided into patches, and processed using a shared Dinov2 encoder. The extracted patch-level features are merged, upsampled, and refined using a DPT-like fusion stage, enabling precise depth estimation. The abstract from the paper is the following: *We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* -drawing - DepthPro architecture. Taken from the original paper. + DepthPro Outputs. Taken from the official code. This model was contributed by [geetu040](https://github.com/geetu040). The original code can be found [here](https://github.com/apple/ml-depth-pro). - +## Usage Tips + +The DepthPro model processes an input image by first downsampling it at multiple scales and splitting each scaled version into patches. These patches are then encoded using a shared Vision Transformer (ViT)-based Dinov2 patch encoder, while the full image is processed by a separate image encoder. The extracted patch features are merged into feature maps, upsampled, and fused using a DPT-like decoder to generate the final depth estimation. If enabled, an additional Field of View (FOV) encoder processes the image for estimating the camera's field of view, aiding in depth accuracy. 
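To make the multi-scale patching described in the paragraph above concrete, here is a minimal, self-contained sketch of the idea: the image is downsampled to several ratios and each scaled copy is cut into overlapping fixed-size windows. This is an illustration only, not the implementation added by this patch; the ratios, patch size, and overlap value are placeholder numbers.

```python
import torch
import torch.nn.functional as F


def make_overlapping_patches(pixel_values, patch_size=384, overlap_ratio=0.25):
    # A stride smaller than the window size produces overlapping patches.
    stride = int(patch_size * (1 - overlap_ratio))
    windows = pixel_values.unfold(2, patch_size, stride).unfold(3, patch_size, stride)
    # (batch, channels, n_h, n_w, patch, patch) -> (batch * n_h * n_w, channels, patch, patch)
    batch, channels, n_h, n_w, _, _ = windows.shape
    return windows.permute(0, 2, 3, 1, 4, 5).reshape(-1, channels, patch_size, patch_size)


image = torch.rand(1, 3, 1536, 1536)  # dummy input image
for ratio in (0.25, 0.5, 1.0):  # example scaling ratios
    scaled = F.interpolate(image, scale_factor=ratio, mode="bilinear", align_corners=False)
    # the smallest scale is already patch-sized, so it is kept as a single patch
    patches = make_overlapping_patches(scaled) if min(scaled.shape[-2:]) > 384 else scaled
    print(f"ratio={ratio}: scaled {tuple(scaled.shape)} -> patches {tuple(patches.shape)}")
```

Each batch of patches then goes through the shared patch encoder, and the resulting patch features are merged back into feature maps before upsampling and fusion, as the paragraph above describes.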
-Here's an improved version of your documentation with enhanced clarity, formatting, and structure for easier understanding: +```py +>>> import requests +>>> from PIL import Image +>>> import torch +>>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation ---- +>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +>>> image = Image.open(requests.get(url, stream=True).raw) -## **Usage Tips** +>>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/DepthPro") +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro") -Initialize the Model with Default Configuration -```python -from transformers import DepthProConfig, DepthProModel +>>> inputs = image_processor(images=image, return_tensors="pt") -config = DepthProConfig() -model = DepthProModel(config=config) -``` +>>> with torch.no_grad(): +... outputs = model(**inputs) -Load a Pre-Trained Model for Depth Estimation -```python -from transformers import DepthProConfig, DepthProForDepthEstimation +>>> post_processed_output = image_processor.post_process_depth_estimation( +... outputs, target_sizes=[(image.height, image.width)], +... ) -checkpoint = "geetu040/DepthPro" -model = DepthProForDepthEstimation.from_pretrained(checkpoint) -config = model.config +>>> fov = post_processed_output[0]["fov"] +>>> depth = post_processed_output[0]["predicted_depth"] +>>> depth = (depth - depth.min()) / depth.max() +>>> depth = depth * 255. +>>> depth = depth.detach().cpu().numpy() +>>> depth = Image.fromarray(depth.astype("uint8")) ``` -Key Features and Configuration Details +### Architecture and Configuration -1. Dual-Encoder Architecture: - - The `DepthProModel` uses **two encoders**: - - **`image_encoder`** and **`patch_encoder`**, which can be configured via `image_model_config` and `patch_model_config` in the configuration. - - By default, and in the pre-trained model, both encoders use the **`Dinov2Model`** architecture. + + + DepthPro architecture. Taken from the original paper. -2. Image Scaling and Patch Processing: - - Input images are scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration. +The `DepthProForDepthEstimation` model uses a `DepthProEncoder`, for encoding the input image and a `FeatureFusionStage` for fusing the output features from encoder. + +The `DepthProEncoder` further uses two encoders: +- `patch_encoder` + - Input image is scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration. - Each scaled image is split into smaller **patches** of size `patch_size` with overlapping areas determined by `scaled_images_overlap_ratios`. - - These patches are processed by the **`patch_encoder`**, while the image is also rescaled to `patch_size` and is processed by the **`image_encoder`**. - - Outputs from both encoders (`last_hidden_state`) and selected intermediate states (`hidden_states`) from **`patch_encoder`** are fused by a `DPT`-based `FeatureFusionStage` for depth estimation. - -3. Optional Field of View (FOV) Prediction: - - If `use_fov_model` is set to `True` in the configuration, the model predicts the **Field of View (FOV)** using a third encoder. - - This encoder also scales the image to `patch_size` and uses its `last_hidden_state` for FOV prediction. The encoder can be specified in the configuration using `fov_model_config`. - -4. Configuration and Validation: - - All encoders receive input images of size `patch_size`. 
- - The `image_size` for each encoder in the configuration should match the `patch_size`. This is validated when creating a `DepthProConfig`. - -5. Preprocessing and Postprocessing: - - Use the `DepthProImageProcessor` for preparing inputs and processing outputs: - - **Preprocessing**: Prepare images (rescale, normalize, resize) for model input. - - **Postprocessing**: Use `DepthProImageProcessor.post_process_depth_estimation` to interpolate the predicted depth to match the original input image size. - -6. Support for Variable Resolution and Aspect Ratios: - - The `DepthProModel` can process images with different resolutions and aspect ratios. However, for generating predicted depths that match the input image size, ensure the configuration satisfies: - ```py - input_image_size / 2**(n_fusion_blocks + 1) == image_model_config.image_size / image_model_config.patch_size - ``` - - - **Where**: - - `input_image_size`: The size of the input image. - - `image_model_config.image_size`: Image size for **`image_encoder`** which equals to `patch_size` in `DepthProConfig`. - - `n_fusion_blocks`: Total fusion blocks, calculated as: - ```py - len(intermediate_hook_ids) + len(scaled_images_ratios) - ``` - -### **Customizing Encoders in `DepthProModel`** - -The `DepthProModel` architecture uses **three encoders**, each responsible for a specific task: - -1. **Patch Encoder**: Processes image patches created by splitting the input image. -2. **Image Encoder**: Processes the input image resized to `patch_size`. -3. **FOV (Field of View) Encoder**: Generates the Field of View (FOV), if `use_fov_model` is enabled. - -You can configure each encoder to use any compatible model architecture. For example, to use: -- **`ViT` (Vision Transformer)** as the **patch encoder**, and -- **`BEiT`** as the **image encoder**, and -- **`DinoV2`** as the **FOV encoder**. - -```python -from transformers import DepthProConfig, DepthProForDepthEstimation - -config = DepthProConfig( - patch_model_config={ - "model_type": "vit", - "num_hidden_layers": 6, - "patch_size": 16, - "hidden_size": 512, - "num_attention_heads": 16, - "image_size": 384, # matches `patch_size` - }, - image_model_config={ - "model_type": "beit", - "num_hidden_layers": 4, - "patch_size": 8, - "hidden_size": 256, - "num_attention_heads": 8, - "image_size": 384, # matches `patch_size` - }, - fov_model_config={ - "model_type": "dinov2", - "num_hidden_layers": 4, - "patch_size": 8, - "hidden_size": 256, - "num_attention_heads": 8, - "image_size": 384, # matches `patch_size` - }, - patch_size=384, - # uses layers from the patch encoder - intermediate_hook_ids=[5, 1], - use_fov_model=True, -) -model = DepthProForDepthEstimation(config) + - These patches are processed by the **`patch_encoder`** +- `image_encoder` + - Input image is also rescaled to `patch_size` and processed by the **`image_encoder`** + +Both these encoders can be configured via `patch_model_config` and `image_model_config` respectively, both of which are seperate `Dinov2Model` by default. + +Outputs from both encoders (`last_hidden_state`) and selected intermediate states (`hidden_states`) from **`patch_encoder`** are fused by a `DPT`-based `FeatureFusionStage` for depth estimation. + +### Field-of-View (FOV) Prediction + +The network is supplemented with a focal length estimation head. A small convolutional head ingests frozen features from the depth estimation network and task-specific features from a separate ViT image encoder to predict the horizontal angular field-of-view. 
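Since this head predicts a horizontal angular field-of-view, a common follow-up is converting that angle into a focal length in pixels under a pinhole-camera assumption. The sketch below is illustrative and not part of this patch; it assumes the predicted FOV is expressed in degrees, as the integration-test value of about 47.2 earlier in this series suggests, and the image width is a placeholder.

```python
import math


def fov_to_focal_length_px(fov_degrees: float, image_width_px: int) -> float:
    # Pinhole model: focal_length = (width / 2) / tan(horizontal_fov / 2)
    return 0.5 * image_width_px / math.tan(0.5 * math.radians(fov_degrees))


# Example: a ~47.2 degree horizontal FOV on a 1536 px wide image gives roughly a 1756 px focal length.
print(fov_to_focal_length_px(fov_degrees=47.2459, image_width_px=1536))
```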
+ +The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model. + +The pretrained model at checkpoint `geetu040/DepthPro` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. +```py +>>> from transformers import DepthProForDepthEstimation +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", use_fov_model=False) ``` +To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config. +```py +>>> from transformers import DepthProConfig, DepthProForDepthEstimation +>>> config = DepthProConfig(use_fov_model=True) +>>> model = DepthProForDepthEstimation(config) +``` + +Or set `use_fov_model=True` when initializing the model, which overrides the value in config. +```py +>>> from transformers import DepthProConfig, DepthProForDepthEstimation +>>> config = DepthProConfig() +>>> model = DepthProForDepthEstimation(config, use_fov_model=True) +``` + +### Image Resolution and Aspect Ratio + +The network can process images of different resolutions and aspect ratios and the predicted depth size can be calculated using the following formula: + +$\text{Predicted Depth Size} = \frac{2^{N+1} \cdot S}{P}$ + +Where: +- $N = \text{len}(\text{intermediate\_hook\_ids}) + \text{len}(\text{scaled\_images\_ratios})$ +- $S = \text{image\_model\_config.image\_size}$ +- $P = \text{image\_model\_config.patch\_size}$ + +The aspect ratio of the raw predicted depth is maintained as the aspect ratio of the input image. + ### Using Scaled Dot Product Attention (SDPA) PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. 
This function @@ -178,6 +157,7 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` - Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) - Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) +- DepthPro Inference Notebook: [DepthPro Inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/DepthPro_inference.ipynb) - DepthPro for Super Resolution and Image Segmentation - Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba) - Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth) From 90627677b2d2e309c26cfd3a3e2e6dfa4acf868b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 19:39:02 +0500 Subject: [PATCH 106/151] improve initialization of convolution layers --- src/transformers/models/depth_pro/modeling_depth_pro.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index eabcbe990dbf..cc636e4d494a 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -635,7 +635,7 @@ class DepthProPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) @@ -644,6 +644,10 @@ def _init_weights(self, module): elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) + elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)): + nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu') + if module.bias is not None: + module.bias.data.zero_() @add_start_docstrings( From fcba6bd16b5cb884f1414ff42ac611a9b417a719 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 19:43:29 +0500 Subject: [PATCH 107/151] fix unused fov keys --- src/transformers/models/depth_pro/modeling_depth_pro.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index cc636e4d494a..a5ff0c48c058 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -632,6 +632,7 @@ class DepthProPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _supports_sdpa = True _no_split_modules = [] + _keys_to_ignore_on_load_unexpected = ['fov_model.*'] def _init_weights(self, module): """Initialize the weights""" From 56cd570cfc346c44e4609978c7c6527b9d67b4c5 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 20:42:54 +0500 Subject: [PATCH 108/151] update tests --- .../test_image_processing_depth_pro.py | 6 +++- .../depth_pro/test_modeling_depth_pro.py | 33 +++++++++++-------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index e9d94151e145..de2f09063a67 
100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -17,7 +17,7 @@ import unittest from transformers.file_utils import is_vision_available -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import is_flaky, require_torch, require_vision from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -111,3 +111,7 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + @is_flaky(max_attempts=5, description="fast and slow, both use torch implementation, see: https://github.com/huggingface/transformers/issues/34920") + def test_fast_is_faster_than_slow(self): + super().test_fast_is_faster_than_slow() diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 4347d507ae34..a89f6a1195b6 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -300,22 +300,27 @@ def test_initialization(self): configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) - # Skip the check for the backbone - backbone_params = [] - for name, module in model.named_modules(): - if module.__class__.__name__ == "DepthProViTHybridEmbeddings": - backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] - break - for name, param in model.named_parameters(): + non_uniform_init_parms = [ + # these encoders are vision transformers + # any layer outside these encoders is either Conv2d or ConvTranspose2d + # which use kaiming initialization + "patch_encoder", + "image_encoder", + "fov_model.encoder", + ] if param.requires_grad: - if name in backbone_params: - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) + if any(x in name for x in non_uniform_init_parms): + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) @slow def test_model_from_pretrained(self): From 26b1391d0138275895d86da08dd97ceb435dcf3e Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 21:10:11 +0500 Subject: [PATCH 109/151] ruff format --- src/transformers/models/depth_pro/modeling_depth_pro.py | 4 ++-- tests/models/depth_pro/test_image_processing_depth_pro.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index a5ff0c48c058..7e36d3f8f306 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -632,7 +632,7 @@ class DepthProPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _supports_sdpa = True _no_split_modules = [] - _keys_to_ignore_on_load_unexpected = ['fov_model.*'] + _keys_to_ignore_on_load_unexpected = ["fov_model.*"] def _init_weights(self, 
module): """Initialize the weights""" @@ -646,7 +646,7 @@ def _init_weights(self, module): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)): - nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") if module.bias is not None: module.bias.data.zero_() diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index de2f09063a67..0e830698c0a1 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -112,6 +112,8 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - @is_flaky(max_attempts=5, description="fast and slow, both use torch implementation, see: https://github.com/huggingface/transformers/issues/34920") + @is_flaky( + description="fast and slow, both processors use torch implementation, see: https://github.com/huggingface/transformers/issues/34920", + ) def test_fast_is_faster_than_slow(self): super().test_fast_is_faster_than_slow() From 01247f8e29e83be11a5d7e92aa37673a205ae1fe Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 31 Jan 2025 10:12:26 +0500 Subject: [PATCH 110/151] fix test, amid kaimming initialization --- tests/models/depth_pro/test_modeling_depth_pro.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index a89f6a1195b6..2f728ada14df 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -45,7 +45,7 @@ def __init__( parent, batch_size=8, image_size=64, - patch_size=8, + patch_size=16, num_channels=3, is_training=True, use_labels=True, @@ -322,6 +322,11 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + # this started when switched from normal initialization to kaiming_normal intialization + # maybe because the magnitude of offset values from ViT-encoders increases when followed by many convolution layers + def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): + super().test_batching_equivalence(atol=atol, rtol=rtol) + @slow def test_model_from_pretrained(self): model_path = "geetu040/DepthPro" From 0b7e77fbb750a0fce386186f461f5dd564f498e2 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 31 Jan 2025 20:54:12 +0500 Subject: [PATCH 111/151] add depthpro to toctree --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1e02abd1bb76..ff3359628de8 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -651,6 +651,8 @@ title: Depth Anything - local: model_doc/depth_anything_v2 title: Depth Anything V2 + - local: model_doc/depth_pro + title: DepthPro - local: model_doc/deta title: DETA - local: model_doc/detr From 20b277de61c908b05a62c9d0be14c4899b2fee90 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 31 Jan 2025 21:04:58 +0500 Subject: [PATCH 112/151] add residual layer to _no_split_modules --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 7e36d3f8f306..85a866e860d7 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -631,7 +631,7 @@ class DepthProPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" supports_gradient_checkpointing = True _supports_sdpa = True - _no_split_modules = [] + _no_split_modules = ["DepthProPreActResidualLayer"] _keys_to_ignore_on_load_unexpected = ["fov_model.*"] def _init_weights(self, module): From ff0e408cb27446e7977e2ffde01bf4d2655063e8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 18:15:05 +0500 Subject: [PATCH 113/151] architecture rework --- .../convert_depth_pro_weights_to_hf.py | 51 +- .../models/depth_pro/modeling_depth_pro.py | 623 ++++++++++-------- 2 files changed, 384 insertions(+), 290 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index e17e53efe8cc..bddc3114ffec 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -32,44 +32,44 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = { # encoder and head - r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", - r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", - r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", - r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", - r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", - r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.model.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.model.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": 
r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.layernorm.\2", + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.neck.fuse_image_with_low_res.\1", r"head.(\d+).(weight|bias)": r"head.head.\1.\2", # fov - r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", - r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", - r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", - r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", - r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", - r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", - r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", - r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", - r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + r"fov.encoder.0.cls_token": r"fov_model.fov_encoder.model.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.fov_encoder.model.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.fov_encoder.model.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.fov_encoder.model.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.fov_encoder.model.layernorm.\1", r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", - r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", - r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.fov_encoder.neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2", # upsamples - r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.encoder.feature_upsample.image_block.layers.0.\1", + r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.neck.feature_upsample.image_block.layers.0.\1", r"encoder.upsample_latent(\d+).(\d+).(weight|bias)": lambda match: ( - f"depth_pro.encoder.feature_upsample.intermediate.{1-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" + f"depth_pro.neck.feature_upsample.intermediate.{1-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" ), r"encoder.upsample(\d+).(\d+).(weight|bias)": lambda match: ( - f"depth_pro.encoder.feature_upsample.scaled_images.{2-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" + 
f"depth_pro.neck.feature_upsample.scaled_images.{2-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" ), # projections between encoder and fusion r"decoder.convs.(\d+).weight": lambda match: ( - f"depth_pro.encoder.feature_projection.projections.{4-int(match.group(1))}.weight" + f"depth_pro.neck.feature_projection.projections.{4-int(match.group(1))}.weight" ), # fusion stage @@ -160,7 +160,6 @@ def write_model( # download and load state_dict from hf repo file_path = hf_hub_download(hf_repo_id, "depth_pro.pt") - # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" # when you already have the files locally loaded = torch.load(file_path, weights_only=True) # ensure state_dict is in float32 diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 85a866e860d7..b9ddc579ef36 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -47,8 +47,8 @@ class DepthProOutput(ModelOutput): Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. - features (`List[torch.FloatTensor]`, *optional*: - Features from scaled images and hidden_states. + features (`Union[torch.FloatTensor, List[torch.FloatTensor]]`, *optional*): + Features from encoders. Can be a single feature or a list of features. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`. 
@@ -63,7 +63,7 @@ class DepthProOutput(ModelOutput): """ last_hidden_state: torch.FloatTensor = None - features: Optional[List[torch.FloatTensor]] = None + features: Union[torch.FloatTensor, List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -100,138 +100,6 @@ class DepthProDepthEstimatorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -class DepthProFeatureUpsampleBlock(nn.Module): - def __init__( - self, - config: DepthProConfig, - input_dims: int, - intermediate_dims: int, - output_dims: int, - n_upsample_layers: int, - use_proj: bool = True, - bias: bool = False, - ): - super().__init__() - self.config = config - self.layers = nn.ModuleList() - - # create first projection layer - if use_proj: - proj = nn.Conv2d( - in_channels=input_dims, - out_channels=intermediate_dims, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.layers.append(proj) - - # create following upsample layers - for i in range(n_upsample_layers): - in_channels = intermediate_dims if i == 0 else output_dims - layer = nn.ConvTranspose2d( - in_channels=in_channels, - out_channels=output_dims, - kernel_size=2, - stride=2, - padding=0, - bias=bias, - ) - self.layers.append(layer) - - def forward(self, features: torch.Tensor) -> torch.Tensor: - for layer in self.layers: - features = layer(features) - return features - - -class DepthProFeatureUpsample(nn.Module): - def __init__(self, config: DepthProConfig): - super().__init__() - self.config = config - self.n_scaled_images = len(self.config.scaled_images_ratios) - self.n_intermediate_hooks = len(self.config.intermediate_hook_ids) - - # for image_features - self.image_block = DepthProFeatureUpsampleBlock( - config=config, - input_dims=config.image_model_config.hidden_size, - intermediate_dims=config.image_model_config.hidden_size, - output_dims=config.scaled_images_feature_dims[0], - n_upsample_layers=1, - use_proj=False, - bias=True, - ) - - # for scaled_images_features - self.scaled_images = nn.ModuleList() - for i, feature_dims in enumerate(config.scaled_images_feature_dims): - block = DepthProFeatureUpsampleBlock( - config=config, - input_dims=config.patch_model_config.hidden_size, - intermediate_dims=feature_dims, - output_dims=feature_dims, - n_upsample_layers=1, - ) - self.scaled_images.append(block) - - # for intermediate_features - self.intermediate = nn.ModuleList() - for i, feature_dims in enumerate(config.intermediate_feature_dims): - intermediate_dims = config.fusion_hidden_size if i == 0 else feature_dims - block = DepthProFeatureUpsampleBlock( - config=config, - input_dims=config.patch_model_config.hidden_size, - intermediate_dims=intermediate_dims, - output_dims=feature_dims, - n_upsample_layers=2 + i, - ) - self.intermediate.append(block) - - def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: - features[0] = self.image_block(features[0]) - - for i in range(self.n_scaled_images): - features[i + 1] = self.scaled_images[i](features[i + 1]) - - for i in range(self.n_intermediate_hooks): - features[self.n_scaled_images + i + 1] = self.intermediate[i](features[self.n_scaled_images + i + 1]) - - return features - - -class DepthProFeatureProjection(nn.Module): - def __init__(self, config: DepthProConfig): - super().__init__() - self.config = config - - combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims - self.projections = nn.ModuleList() - for i, 
in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: - # projection for last layer can be ignored if input and output channels already match - self.projections.append(nn.Identity()) - else: - self.projections.append( - nn.Conv2d( - in_channels=in_channels, - out_channels=config.fusion_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - ) - - def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: - projected_features = [] - for i, projection in enumerate(self.projections): - upsampled_feature = projection(features[i]) - projected_features.append(upsampled_feature) - return projected_features - - def split_to_patches(pixel_values: torch.Tensor, patch_size: int, overlap_ratio: float) -> torch.Tensor: """Creates Patches from Batch.""" batch_size, num_channels, height, width = pixel_values.shape @@ -369,11 +237,10 @@ def reconstruct_feature_maps( return features -class DepthProEncoder(nn.Module): +class DepthProPatchEncoder(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config - self.fusion_hidden_size = config.fusion_hidden_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -382,48 +249,17 @@ def __init__(self, config: DepthProConfig): self.scaled_images_feature_dims = config.scaled_images_feature_dims self.merge_padding_value = config.merge_padding_value - self.n_scaled_images = len(self.scaled_images_ratios) - self.n_intermediate_hooks = len(self.intermediate_hook_ids) - - # patch encoder - self.patch_encoder = AutoModel.from_config(config.patch_model_config) - - # image encoder - self.image_encoder = AutoModel.from_config(config.image_model_config) - - # upsample features - self.feature_upsample = DepthProFeatureUpsample(config) - - # for STEP 7: fuse low_res and image features - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=config.scaled_images_feature_dims[0] * 2, - out_channels=config.scaled_images_feature_dims[0], - kernel_size=1, - stride=1, - padding=0, - bias=True, - ) + self.n_scaled_images = len(config.scaled_images_ratios) + self.n_intermediate_hooks = len(config.intermediate_hook_ids) + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size - # project features - self.feature_projection = DepthProFeatureProjection(config) + self.model = AutoModel.from_config(config.patch_model_config) def forward( self, pixel_values: torch.Tensor, head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, DepthProOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values.dim() != 4: - raise ValueError("Input tensor must have shape (batch_size, num_channels, height, width).") - + ) -> List[torch.Tensor]: batch_size, num_channels, height, width = pixel_values.shape if min(self.scaled_images_ratios) * min(height, width) < self.config.patch_size: @@ -457,39 +293,23 @@ def forward( n_patches_per_scaled_image = [len(i) for i in scaled_images] patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high 
res patches first - # STEP 3: apply patch and image encoder + # STEP 3: apply patch encoder - patch_encodings = self.patch_encoder( + encodings = self.model( # each patch is processed as a separate batch patches, head_mask=head_mask, # required for intermediate features output_hidden_states=self.n_intermediate_hooks > 0, - return_dict=return_dict, ) - scaled_images_last_hidden_state = torch.split_with_sizes(patch_encodings[0], n_patches_per_scaled_image[::-1]) - scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] + scaled_images_last_hidden_state = torch.split_with_sizes(encodings[0], n_patches_per_scaled_image[::-1]) # -1 (reverse list) as patch encoder returns high res patches first, we need low res first - - # scale the image to patch size for image_encoder - image_scaled_to_patch_size = F.interpolate( - pixel_values, - size=(self.config.patch_size, self.config.patch_size), - mode="bilinear", - align_corners=False, - ) - image_encodings = self.image_encoder( - pixel_values=image_scaled_to_patch_size, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) + scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] # calculate base height and width # base height and width are the dimensions of the lowest resolution features - out_size = torch_int(image_encodings[0].shape[1] ** 0.5) - exponent_value = torch_int(math.log2(width / out_size)) + exponent_value = torch_int(math.log2(width / self.out_size)) base_height = height // 2**exponent_value base_width = width // 2**exponent_value @@ -515,7 +335,7 @@ def forward( intermediate_features = [] for i in range(self.n_intermediate_hooks): # +1 to correct index position as hidden_states contain embedding output as well - hidden_state = patch_encodings[2][self.intermediate_hook_ids[i] + 1] + hidden_state = encodings[2][self.intermediate_hook_ids[i] + 1] padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[-1])) output_height = base_height * 2 ** (self.n_scaled_images - 1) output_width = base_width * 2 ** (self.n_scaled_images - 1) @@ -527,39 +347,112 @@ def forward( ) intermediate_features.append(features) - # STEP 6: get image features - (6) in diagram + # STEP 7: combine all features + features = [*scaled_images_features, *intermediate_features] + + return features + - image_features = reconstruct_feature_maps( - image_encodings[0], +class DepthProImageEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.image_model_config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image to patch size for image_encoder + pixel_values = F.interpolate( + pixel_values, + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, + ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = 
torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + encodings[0], batch_size=batch_size, padding=0, output_size=(base_height, base_width), ) - # STEP 7: combine all features - features = [ - image_features, - *scaled_images_features, - *intermediate_features, - ] + if not return_dict: + return (encodings[0], features) + encodings[2:] # ignore last_hidden_state and poooler output - # STEP 8: upsample features - features = self.feature_upsample(features) + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) - # STEP 9: apply fusion - # (global features = low res features + image features) - # fuses image_features with lowest resolution features as they are of same size - global_features = torch.cat((features[1], features[0]), dim=1) - global_features = self.fuse_image_with_low_res(global_features) - features = [global_features, *features[2:]] - # STEP 10: project features - features = self.feature_projection(features) +class DepthProEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.intermediate_hook_ids = config.intermediate_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.merge_padding_value = config.merge_padding_value - # STEP 11: return output + self.n_scaled_images = len(self.scaled_images_ratios) + self.n_intermediate_hooks = len(self.intermediate_hook_ids) + + self.patch_encoder = DepthProPatchEncoder(config) + self.image_encoder = DepthProImageEncoder(config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, num_channels, height, width = pixel_values.shape + + patch_features = self.patch_encoder( + pixel_values, + head_mask=head_mask, + ) + image_encodings = self.image_encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_features = image_encodings[1] # index 1 contains features + + features = [image_features, *patch_features] if not return_dict: - return (image_encodings[0], features) + image_encodings[2:] # ignore last_hidden_state and poooler output + return (image_encodings[0], features) + image_encodings[2:] return DepthProOutput( last_hidden_state=image_encodings.last_hidden_state, @@ -569,6 +462,164 @@ def forward( ) +class DepthProFeatureUpsampleBlock(nn.Module): + def __init__( + self, + config: DepthProConfig, + input_dims: int, + intermediate_dims: int, + output_dims: int, + n_upsample_layers: int, + use_proj: bool = True, + bias: bool = False, + ): + super().__init__() + self.config = config + self.layers = nn.ModuleList() + + # create first projection layer + if use_proj: + proj = nn.Conv2d( + in_channels=input_dims, + out_channels=intermediate_dims, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.layers.append(proj) + + # create following upsample layers + for i in range(n_upsample_layers): + in_channels 
= intermediate_dims if i == 0 else output_dims + layer = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=bias, + ) + self.layers.append(layer) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + for layer in self.layers: + features = layer(features) + return features + + +class DepthProFeatureUpsample(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.n_scaled_images = len(self.config.scaled_images_ratios) + self.n_intermediate_hooks = len(self.config.intermediate_hook_ids) + + # for image_features + self.image_block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.image_model_config.hidden_size, + intermediate_dims=config.image_model_config.hidden_size, + output_dims=config.scaled_images_feature_dims[0], + n_upsample_layers=1, + use_proj=False, + bias=True, + ) + + # for scaled_images_features + self.scaled_images = nn.ModuleList() + for i, feature_dims in enumerate(config.scaled_images_feature_dims): + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=feature_dims, + output_dims=feature_dims, + n_upsample_layers=1, + ) + self.scaled_images.append(block) + + # for intermediate_features + self.intermediate = nn.ModuleList() + for i, feature_dims in enumerate(config.intermediate_feature_dims): + intermediate_dims = config.fusion_hidden_size if i == 0 else feature_dims + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=intermediate_dims, + output_dims=feature_dims, + n_upsample_layers=2 + i, + ) + self.intermediate.append(block) + + def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: + features[0] = self.image_block(features[0]) + + for i in range(self.n_scaled_images): + features[i + 1] = self.scaled_images[i](features[i + 1]) + + for i in range(self.n_intermediate_hooks): + features[self.n_scaled_images + i + 1] = self.intermediate[i](features[self.n_scaled_images + i + 1]) + + return features + + +class DepthProFeatureProjection(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims + self.projections = nn.ModuleList() + for i, in_channels in enumerate(combined_feature_dims): + if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: + # projection for last layer can be ignored if input and output channels already match + self.projections.append(nn.Identity()) + else: + self.projections.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: + projected_features = [] + for i, projection in enumerate(self.projections): + upsampled_feature = projection(features[i]) + projected_features.append(upsampled_feature) + return projected_features + + +class DepthProNeck(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.feature_upsample = DepthProFeatureUpsample(config) + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.scaled_images_feature_dims[0] * 2, + out_channels=config.scaled_images_feature_dims[0], + kernel_size=1, + stride=1, + padding=0, + 
bias=True, + ) + self.feature_projection = DepthProFeatureProjection(config) + + def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: + features = self.feature_upsample(features) + # global features = low res features + image features + global_features = torch.cat((features[1], features[0]), dim=1) + global_features = self.fuse_image_with_low_res(global_features) + features = [global_features, *features[2:]] + features = self.feature_projection(features) + return features + + # General docstring _CONFIG_FOR_DOC = "DepthProConfig" @@ -660,22 +711,20 @@ def __init__(self, config): super().__init__(config) self.config = config self.encoder = DepthProEncoder(config) + self.neck = DepthProNeck(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): - # TODO: return hidden_states from patch_encodings instead of image_encodings - # return self.encoder.patch_encoder.embeddings.patch_embeddings - return self.encoder.image_encoder.embeddings.patch_embeddings + return self.encoder.image_encoder.model.get_input_embeddings() def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ - for layer, heads in heads_to_prune.items(): - self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder.patch_encoder.model._prune_heads(heads_to_prune) + self.encoder.image_encoder.model._prune_heads(heads_to_prune) @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) @@ -727,8 +776,18 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) + features = encodings[1] # index 1 contains features + features = self.neck(features) + + if not return_dict: + return (encodings[0], features) + encodings[2:] - return encodings + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) # Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro @@ -863,25 +922,63 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: return fused_hidden_states -class DepthProFOVModel(nn.Module): +class DepthProFOVEncoder(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config - self.fusion_hidden_size = config.fusion_hidden_size + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size - self.out_size = config.fov_model_config.image_size // config.fov_model_config.patch_size + self.model = AutoModel.from_config(config.fov_model_config) + self.neck = nn.Linear(config.fov_model_config.hidden_size, config.fusion_hidden_size // 2) - self.encoder = AutoModel.from_config(config.fov_model_config) - self.encoder_neck = nn.Linear(config.fov_model_config.hidden_size, self.fusion_hidden_size // 2) - self.global_neck = nn.Sequential( - nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image to patch size for image_encoder + pixel_values 
= F.interpolate( + pixel_values, + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + ) + hidden_state = encodings[0] + hidden_state = self.neck(hidden_state) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=0, + output_size=(base_height, base_width), + ) + + return features + + +class DepthProFOVHead(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size # create initial head layers - self.head = nn.Sequential() + self.layers = nn.ModuleList() for i in range(config.num_fov_head_layers): - self.head.append( + self.layers.append( nn.Conv2d( math.ceil(self.fusion_hidden_size / 2 ** (i + 1)), math.ceil(self.fusion_hidden_size / 2 ** (i + 2)), @@ -890,55 +987,53 @@ def __init__(self, config: DepthProConfig): padding=1, ) ) - self.head.append(nn.ReLU(True)) + self.layers.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) final_kernal_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) - self.head.append( + self.layers.append( nn.Conv2d( in_channels=final_in_channels, out_channels=1, kernel_size=final_kernal_size, stride=1, padding=0 ) ) - def forward( - self, - pixel_values: torch.Tensor, - global_features: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - batch_size, num_channels, height, width = pixel_values.shape - - image_scaled_to_patch_size = F.interpolate( - pixel_values, - size=(self.config.patch_size, self.config.patch_size), + def forward(self, features: torch.Tensor) -> torch.Tensor: + features = F.interpolate( + features, + size=(self.out_size, self.out_size), mode="bilinear", align_corners=False, ) - encodings = self.encoder( - image_scaled_to_patch_size, - head_mask=head_mask, - ) - hidden_state = encodings[0] - hidden_state = self.encoder_neck(hidden_state) + for layer in self.layers: + features = layer(features) + return features - fov_features = reconstruct_feature_maps( - hidden_state, - batch_size=batch_size, - padding=0, - output_size=(self.out_size, self.out_size), + +class DepthProFOVModel(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + + self.fov_encoder = DepthProFOVEncoder(config) + self.global_neck = nn.Sequential( + nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), ) + self.head = DepthProFOVHead(config) + def forward( + self, + pixel_values: torch.Tensor, + global_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + fov_features = self.fov_encoder(pixel_values, head_mask) global_features = self.global_neck(global_features) - global_features = F.interpolate( - global_features, - size=(self.out_size, self.out_size), - mode="bilinear", 
- align_corners=False, - ) fov_features = fov_features + global_features fov_output = self.head(fov_features) - fov_output = fov_output.reshape(batch_size) + fov_output = fov_output.squeeze() return fov_output From 1522c530ef979894b39377118ae1fe4516e6b096 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Sat, 1 Feb 2025 19:42:28 +0500 Subject: [PATCH 114/151] Update src/transformers/models/depth_pro/image_processing_depth_pro.py Co-authored-by: Pavel Iakubovskii --- src/transformers/models/depth_pro/image_processing_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 8b7c84b71943..60bea8460cb4 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -20,7 +20,7 @@ if TYPE_CHECKING: - from ...modeling_outputs import DepthProDepthEstimatorOutput + from .modeling_depth_pro import DepthProDepthEstimatorOutput from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import to_channel_dimension_format From 131817ad1ea403cee411fbdce26ecef32df3b39a Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Sat, 1 Feb 2025 19:42:40 +0500 Subject: [PATCH 115/151] Update src/transformers/models/depth_pro/image_processing_depth_pro_fast.py Co-authored-by: Pavel Iakubovskii --- .../models/depth_pro/image_processing_depth_pro_fast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 482755a83f52..15ac15a90ddb 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -37,7 +37,8 @@ if TYPE_CHECKING: - from ...modeling_outputs import DepthProDepthEstimatorOutput + from .modeling_depth_pro import DepthProDepthEstimatorOutput + logger = logging.get_logger(__name__) From 72a1f0cacc253cf7068293d2f4bba74e603a94d1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 19:56:49 +0500 Subject: [PATCH 116/151] update docs --- docs/source/en/model_doc/depth_pro.md | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 20b526dda76e..a2076ec8cb75 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -26,7 +26,7 @@ The abstract from the paper is the following: *We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. 
Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* -drawing DepthPro Outputs. Taken from the official code. @@ -113,19 +113,6 @@ Or set `use_fov_model=True` when initializing the model, which overrides the val >>> model = DepthProForDepthEstimation(config, use_fov_model=True) ``` -### Image Resolution and Aspect Ratio - -The network can process images of different resolutions and aspect ratios and the predicted depth size can be calculated using the following formula: - -$\text{Predicted Depth Size} = \frac{2^{N+1} \cdot S}{P}$ - -Where: -- $N = \text{len}(\text{intermediate\_hook\_ids}) + \text{len}(\text{scaled\_images\_ratios})$ -- $S = \text{image\_model\_config.image\_size}$ -- $P = \text{image\_model\_config.patch\_size}$ - -The aspect ratio of the raw predicted depth is maintained as the aspect ratio of the input image. - ### Using Scaled Dot Product Attention (SDPA) PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function From aed7e3dacaa8866a292e87767596d165bdd76f01 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 20:17:48 +0500 Subject: [PATCH 117/151] improve merge_patches --- .../models/depth_pro/modeling_depth_pro.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index b9ddc579ef36..326ab7296482 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -177,18 +177,27 @@ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch boxes_in_row = [] for w in range(sqrt_n_patches_per_batch): box = patches[batch_size * i : batch_size * (i + 1)] + + # collect paddings + paddings = [0, 0, 0, 0] if h != 0: # remove pad from height if box is not at top border - box = box[..., padding:, :] + paddings[0] = padding if w != 0: # remove pad from width if box is not at left border - box = box[..., :, padding:] + paddings[2] = padding if h != sqrt_n_patches_per_batch - 1: # remove pad from height if box is not at bottom border - box = box[..., : box.shape[-2] - padding, :] + paddings[1] = padding if w != sqrt_n_patches_per_batch - 1: # remove pad from width if box is not at right border - box = box[..., :, : box.shape[-1] - padding] + paddings[3] = padding + + # remove paddings + _, _, box_h, box_w = box.shape + pad_top, pad_bottom, pad_left, pad_right = paddings + box = box[:, :, pad_top:box_h - pad_bottom, pad_left:box_w - pad_right] + boxes_in_row.append(box) i += 1 boxes_in_row = torch.cat(boxes_in_row, dim=-1) From 405bee3de4850a0688e6b7466223a4da6f98ef86 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 20:18:10 +0500 Subject: [PATCH 118/151] use flatten with fov_output --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 326ab7296482..f5f741d3d758 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1042,7 +1042,7 @@ def forward( fov_features = fov_features + global_features fov_output = self.head(fov_features) - fov_output = fov_output.squeeze() + fov_output = fov_output.flatten() return fov_output 
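
Patch 117 above collects the per-edge paddings before slicing, so that `merge_patches` trims the overlap only on sides that touch a neighbouring patch, and patch 118 flattens the field-of-view output to one value per image. Below is a minimal sketch of the same trimming idea for a single image, using a hypothetical `merge_patch_grid` helper over a row-major grid of patches; the real implementation additionally interleaves patches across the batch dimension.

```python
import torch

def merge_patch_grid(patches: torch.Tensor, grid_size: int, padding: int) -> torch.Tensor:
    # patches: (grid_size * grid_size, channels, patch_h, patch_w), row-major order
    rows = []
    for h in range(grid_size):
        row = []
        for w in range(grid_size):
            box = patches[h * grid_size + w]
            # trim the overlap only on edges that touch a neighbouring patch
            top = padding if h != 0 else 0
            bottom = padding if h != grid_size - 1 else 0
            left = padding if w != 0 else 0
            right = padding if w != grid_size - 1 else 0
            box = box[:, top : box.shape[-2] - bottom, left : box.shape[-1] - right]
            row.append(box)
        rows.append(torch.cat(row, dim=-1))  # concatenate patches along width
    return torch.cat(rows, dim=-2)           # then stack rows along height

patches = torch.randn(4, 8, 96, 96)          # 2x2 grid, 16 px overlap on inner edges
merged = merge_patch_grid(patches, grid_size=2, padding=16)
print(merged.shape)                          # torch.Size([8, 160, 160])
```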
From a8528da17b77948e4c226e9f5c5881c163155469 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 20:19:41 +0500 Subject: [PATCH 119/151] ruff formatting --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f5f741d3d758..b8cca868e8d1 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -196,7 +196,7 @@ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch # remove paddings _, _, box_h, box_w = box.shape pad_top, pad_bottom, pad_left, pad_right = paddings - box = box[:, :, pad_top:box_h - pad_bottom, pad_left:box_w - pad_right] + box = box[:, :, pad_top : box_h - pad_bottom, pad_left : box_w - pad_right] boxes_in_row.append(box) i += 1 From 31383e12f45a9bede3e73fad3e3ebd862a1492b5 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Mon, 3 Feb 2025 18:59:41 +0500 Subject: [PATCH 120/151] update resources section in docs Co-authored-by: Pavel Iakubovskii --- docs/source/en/model_doc/depth_pro.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index a2076ec8cb75..9a18cfc8735d 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -142,6 +142,8 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` ## Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro: + - Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) - Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) - DepthPro Inference Notebook: [DepthPro Inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/DepthPro_inference.ipynb) @@ -149,8 +151,6 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` - Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba) - Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth) - - If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
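
The "Using Scaled Dot Product Attention (SDPA)" section referenced in the documentation hunks above is not shown in full here. For orientation, the usual way to opt into PyTorch's SDPA when loading a Transformers model is sketched below; the checkpoint name matches the one used elsewhere in this PR, and the half-precision dtype is illustrative rather than something prescribed by the diff.

```python
import torch
from transformers import DepthProForDepthEstimation

# request the SDPA attention implementation explicitly; "eager" is the fallback
model = DepthProForDepthEstimation.from_pretrained(
    "geetu040/DepthPro",
    attn_implementation="sdpa",
    torch_dtype=torch.float16,
)
```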
## DepthProConfig From 641cb841507d80e2bc122899498d7b33280270e2 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Mon, 3 Feb 2025 19:00:52 +0500 Subject: [PATCH 121/151] fix typo "final_kernal_size" Co-authored-by: Pavel Iakubovskii --- src/transformers/models/depth_pro/modeling_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index b8cca868e8d1..5e481045fed2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -999,10 +999,10 @@ def __init__(self, config: DepthProConfig): self.layers.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) - final_kernal_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) + final_kernel_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.layers.append( nn.Conv2d( - in_channels=final_in_channels, out_channels=1, kernel_size=final_kernal_size, stride=1, padding=0 + in_channels=final_in_channels, out_channels=1, kernel_size=final_kernel_size, stride=1, padding=0 ) ) From 6af8a1169d879c5296b82008cf08f7ebb7852354 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Mon, 3 Feb 2025 19:02:24 +0500 Subject: [PATCH 122/151] fix output typehint for DepthProDepthEstimator Co-authored-by: Pavel Iakubovskii --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 5e481045fed2..5cd67b117060 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1114,7 +1114,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor]]: + ) -> Union[Tuple[torch.Tensor], DepthProDepthEstimatorOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Ground truth depth estimation maps for computing the loss. 
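
The `final_kernel_size` renamed in patch 121 above is what lets the field-of-view head collapse its feature map to a single value per image: each stride-2 convolution roughly halves the spatial resolution, and the last convolution uses a kernel as large as whatever resolution is left. A rough shape check under assumed values (`fusion_hidden_size=256`, `out_size=48`, `num_fov_head_layers=2`, chosen only for illustration, with plain `int` standing in for the internal `torch_int` helper):

```python
import math
import torch
from torch import nn

fusion_hidden_size, out_size, num_fov_head_layers = 256, 48, 2  # assumed values

layers = []
for i in range(num_fov_head_layers):
    layers += [
        nn.Conv2d(
            math.ceil(fusion_hidden_size / 2 ** (i + 1)),
            math.ceil(fusion_hidden_size / 2 ** (i + 2)),
            kernel_size=3, stride=2, padding=1,
        ),
        nn.ReLU(True),
    ]
final_in_channels = math.ceil(fusion_hidden_size / 2 ** (num_fov_head_layers + 1))
final_kernel_size = int((out_size - 1) / 2**num_fov_head_layers + 1)  # 12 here
layers.append(nn.Conv2d(final_in_channels, 1, kernel_size=final_kernel_size, stride=1, padding=0))

features = torch.randn(1, fusion_hidden_size // 2, out_size, out_size)
for layer in layers:
    features = layer(features)   # spatial size: 48 -> 24 -> 12 -> 1
print(features.shape)            # torch.Size([1, 1, 1, 1]), one scalar per image
```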
From abd5307c7280a309d6e3b3ed09cd3d673dec4ad6 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Mon, 3 Feb 2025 19:03:03 +0500 Subject: [PATCH 123/151] residual operation in 2 steps Co-authored-by: Pavel Iakubovskii --- src/transformers/models/depth_pro/modeling_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 5cd67b117060..3c6039030ca8 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -883,11 +883,11 @@ def __init__(self, config: DepthProConfig, use_deconv: bool = True): ) self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) - self.skip_add = nn.quantized.FloatFunctional() def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor: if residual is not None: - hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) + residual = self.residual_layer1(residual) + hidden_state = hidden_state + residual hidden_state = self.residual_layer2(hidden_state) if self.use_deconv: From 8dc2751cbe3600177a4564117e79384132a0a052 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 19:15:30 +0500 Subject: [PATCH 124/151] use image_size instead of global patch_size in interpolation --- .../models/depth_pro/modeling_depth_pro.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3c6039030ca8..c270ecd0018b 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -380,10 +380,11 @@ def forward( ) -> Union[tuple, DepthProOutput]: batch_size, num_channels, height, width = pixel_values.shape - # scale the image to patch size for image_encoder + # scale the image for image_encoder + size = self.config.image_model_config.image_size pixel_values = F.interpolate( pixel_values, - size=(self.config.patch_size, self.config.patch_size), + size=(size, size), mode="bilinear", align_corners=False, ) @@ -947,10 +948,11 @@ def forward( ) -> torch.Tensor: batch_size, num_channels, height, width = pixel_values.shape - # scale the image to patch size for image_encoder + # scale the image for fov_encoder + size = self.config.fov_model_config.image_size pixel_values = F.interpolate( pixel_values, - size=(self.config.patch_size, self.config.patch_size), + size=(size, size), mode="bilinear", align_corners=False, ) From 2f88694b3771478cfe1d9393fcc1b10977962b2c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 20:54:56 +0500 Subject: [PATCH 125/151] replace all Sequential with ModuleList --- .../models/depth_pro/modeling_depth_pro.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index c270ecd0018b..888e9d6dce39 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1027,10 +1027,10 @@ def __init__(self, config: DepthProConfig): self.fusion_hidden_size = config.fusion_hidden_size self.fov_encoder = DepthProFOVEncoder(config) - self.global_neck = nn.Sequential( + self.global_neck = nn.ModuleList([ 
nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True), - ) + ]) self.head = DepthProFOVHead(config) def forward( @@ -1040,7 +1040,9 @@ def forward( head_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: fov_features = self.fov_encoder(pixel_values, head_mask) - global_features = self.global_neck(global_features) + + for layer in self.global_neck: + global_features = layer(global_features) fov_features = fov_features + global_features fov_output = self.head(fov_features) @@ -1062,7 +1064,7 @@ def __init__(self, config): self.config = config features = config.fusion_hidden_size - self.head = nn.Sequential( + self.head = nn.ModuleList([ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True @@ -1071,11 +1073,13 @@ def __init__(self, config): nn.ReLU(True), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(), - ) + ]) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for layer in self.head: + hidden_states = layer(hidden_states) - def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: - predicted_depth = self.head(hidden_states) - predicted_depth = predicted_depth.squeeze(dim=1) + predicted_depth = hidden_states.squeeze(dim=1) return predicted_depth From 208ee26b9ef34577a1d4d18dd85a65b7a6a4556a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 21:34:50 +0500 Subject: [PATCH 126/151] update fov --- docs/source/en/model_doc/depth_pro.md | 3 ++- .../depth_pro/image_processing_depth_pro.py | 18 ++++++++++-------- .../image_processing_depth_pro_fast.py | 18 ++++++++++-------- .../models/depth_pro/modeling_depth_pro.py | 10 +++++++--- .../depth_pro/test_modeling_depth_pro.py | 8 ++++---- 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 9a18cfc8735d..a701497caea8 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -58,7 +58,8 @@ The DepthPro model processes an input image by first downsampling it at multiple ... outputs, target_sizes=[(image.height, image.width)], ... ) ->>> fov = post_processed_output[0]["fov"] +>>> field_of_view = post_processed_output[0]["field_of_view"] +>>> focal_length = post_processed_output[0]["focal_length"] >>> depth = post_processed_output[0]["predicted_depth"] >>> depth = (depth - depth.min()) / depth.max() >>> depth = depth * 255. diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 60bea8460cb4..9a9568fdde05 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -337,9 +337,9 @@ def post_process_depth_estimation( target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, ) -> Dict[str, List[TensorType]]: """ - Post-processes the raw depth predictions from the model to generate final depth predictions and optionally - resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) - and adjusts depth values accordingly. + Post-processes the raw depth predictions from the model to generate + final depth predictions which is caliberated using the field of view if provided + and resized to specified target sizes if provided. 
Args: outputs ([`DepthProDepthEstimatorOutput`]): @@ -351,7 +351,7 @@ def post_process_depth_estimation( Returns: `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`. Raises: `ValueError`: @@ -360,7 +360,7 @@ def post_process_depth_estimation( requires_backends(self, "torch") predicted_depth = outputs.predicted_depth - fov = outputs.fov + fov = outputs.field_of_view batch_size = len(predicted_depth) @@ -373,12 +373,13 @@ def post_process_depth_estimation( fov = [None] * batch_size if fov is None else fov target_sizes = [None] * batch_size if target_sizes is None else target_sizes for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes): + focal_length = None if target_size is not None: # scale image w.r.t fov if fov_value is not None: width = target_size[1] - fov_value = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * width / fov_value + focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * width / focal_length # interpolate depth = torch.nn.functional.interpolate( @@ -395,7 +396,8 @@ def post_process_depth_estimation( results.append( { "predicted_depth": depth, - "fov": fov_value, + "field_of_view": fov_value, + "focal_length": focal_length, } ) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 15ac15a90ddb..a56ae831960d 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -317,9 +317,9 @@ def post_process_depth_estimation( target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, ) -> Dict[str, List[TensorType]]: """ - Post-processes the raw depth predictions from the model to generate final depth predictions and optionally - resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) - and adjusts depth values accordingly. + Post-processes the raw depth predictions from the model to generate + final depth predictions which is caliberated using the field of view if provided + and resized to specified target sizes if provided. Args: outputs ([`DepthProDepthEstimatorOutput`]): @@ -331,7 +331,7 @@ def post_process_depth_estimation( Returns: `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`. 
Raises: `ValueError`: @@ -340,7 +340,7 @@ def post_process_depth_estimation( requires_backends(self, "torch") predicted_depth = outputs.predicted_depth - fov = outputs.fov + fov = outputs.field_of_view batch_size = len(predicted_depth) @@ -353,12 +353,13 @@ def post_process_depth_estimation( fov = [None] * batch_size if fov is None else fov target_sizes = [None] * batch_size if target_sizes is None else target_sizes for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes): + focal_length = None if target_size is not None: # scale image w.r.t fov if fov_value is not None: width = target_size[1] - fov_value = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * width / fov_value + focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * width / focal_length # interpolate depth = torch.nn.functional.interpolate( @@ -375,7 +376,8 @@ def post_process_depth_estimation( results.append( { "predicted_depth": depth, - "fov": fov_value, + "field_of_view": fov_value, + "focal_length": focal_length, } ) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 888e9d6dce39..61694940b6da 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -78,7 +78,7 @@ class DepthProDepthEstimatorOutput(ModelOutput): Classification (or regression if config.num_labels==1) loss. predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`): Predicted depth for each pixel. - fov (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): + field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + @@ -95,7 +95,7 @@ class DepthProDepthEstimatorOutput(ModelOutput): loss: Optional[torch.FloatTensor] = None predicted_depth: torch.FloatTensor = None - fov: Optional[torch.FloatTensor] = None + field_of_view: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1153,6 +1153,10 @@ def forward( ... outputs, target_sizes=[(image.height, image.width)], ... 
) + >>> # get the field of view (fov) predictions + >>> field_of_view = post_processed_output[0]["field_of_view"] + >>> focal_length = post_processed_output[0]["focal_length"] + >>> # visualize the prediction >>> predicted_depth = post_processed_output[0]["predicted_depth"] >>> depth = predicted_depth * 255 / predicted_depth.max() @@ -1198,7 +1202,7 @@ def forward( return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - fov=fov, + field_of_view=fov, hidden_states=depth_pro_outputs.hidden_states, attentions=depth_pro_outputs.attentions, ) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 2f728ada14df..1e4ceadbd4eb 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -167,8 +167,8 @@ def create_and_check_for_fov(self, config, pixel_values, labels): row_pixel_values = pixel_values[:1] with torch.no_grad(): - model_batched_output_fov = model(batched_pixel_values).fov - model_row_output_fov = model(row_pixel_values).fov + model_batched_output_fov = model(batched_pixel_values).field_of_view + model_row_output_fov = model(row_pixel_values).field_of_view # check if fov is returned self.parent.assertIsNotNone(model_batched_output_fov) @@ -372,10 +372,10 @@ def test_inference_depth_estimation(self): # verify the predicted fov expected_shape = torch.Size((1,)) - self.assertEqual(outputs.fov.shape, expected_shape) + self.assertEqual(outputs.field_of_view.shape, expected_shape) expected_slice = torch.tensor([47.2459]).to(torch_device) - torch.testing.assert_close(outputs.fov, expected_slice, atol=1e-4, rtol=1e-4) + torch.testing.assert_close(outputs.field_of_view, expected_slice, atol=1e-4, rtol=1e-4) def test_post_processing_depth_estimation(self): model_path = "geetu040/DepthPro" From bc63511b770e207b0b03690047e7a2c04cfe54ff Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 22:19:56 +0500 Subject: [PATCH 127/151] update heads --- .../models/depth_pro/modeling_depth_pro.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 61694940b6da..1a7476b5f22f 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1027,10 +1027,8 @@ def __init__(self, config: DepthProConfig): self.fusion_hidden_size = config.fusion_hidden_size self.fov_encoder = DepthProFOVEncoder(config) - self.global_neck = nn.ModuleList([ - nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - ]) + self.conv = nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1) + self.activation = nn.ReLU(inplace=True) self.head = DepthProFOVHead(config) def forward( @@ -1041,8 +1039,8 @@ def forward( ) -> torch.Tensor: fov_features = self.fov_encoder(pixel_values, head_mask) - for layer in self.global_neck: - global_features = layer(global_features) + global_features = self.conv(global_features) + global_features = self.activation(global_features) fov_features = fov_features + global_features fov_output = self.head(fov_features) @@ -1064,7 +1062,7 @@ def __init__(self, config): self.config = config features = config.fusion_hidden_size - self.head = nn.ModuleList([ + self.layers = nn.ModuleList([ nn.Conv2d(features, features // 2, 
kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True @@ -1076,7 +1074,7 @@ def __init__(self, config): ]) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - for layer in self.head: + for layer in self.layers: hidden_states = layer(hidden_states) predicted_depth = hidden_states.squeeze(dim=1) From e33a531d2f88460302de13693ab1dbc47a73a9aa Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 22:54:03 +0500 Subject: [PATCH 128/151] fix and update conversion script for heads --- .../depth_pro/convert_depth_pro_weights_to_hf.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index bddc3114ffec..ddb0b9bd9724 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -31,7 +31,7 @@ # fmt: off ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # encoder and head + # encoder r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.model.embeddings.cls_token", r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.model.embeddings.position_embeddings", r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.embeddings.patch_embeddings.projection.\2", @@ -42,7 +42,6 @@ r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.mlp.fc\3.\4", r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.layernorm.\2", r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.neck.fuse_image_with_low_res.\1", - r"head.(\d+).(weight|bias)": r"head.head.\1.\2", # fov r"fov.encoder.0.cls_token": r"fov_model.fov_encoder.model.embeddings.cls_token", @@ -54,9 +53,12 @@ r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.fov_encoder.model.encoder.layer.\1.layer_scale\2.lambda1", r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.mlp.fc\2.\3", r"fov.encoder.0.norm.(weight|bias)": r"fov_model.fov_encoder.model.layernorm.\1", - r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.downsample.0.(weight|bias)": r"fov_model.conv.\1", r"fov.encoder.1.(weight|bias)": r"fov_model.fov_encoder.neck.\1", - r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2", + r"fov.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2", + + # head + r"head.(\d+).(weight|bias)": r"head.layers.\1.\2", # upsamples r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.neck.feature_upsample.image_block.layers.0.\1", From 8c0e81a975a9d161957b53a9facc2f4b53476107 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 22:58:41 +0500 Subject: [PATCH 129/151] ruff formatting --- .../models/depth_pro/modeling_depth_pro.py | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 1a7476b5f22f..26384139c267 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1027,7 +1027,9 @@ def __init__(self, config: DepthProConfig): self.fusion_hidden_size = 
config.fusion_hidden_size self.fov_encoder = DepthProFOVEncoder(config) - self.conv = nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1) + self.conv = nn.Conv2d( + self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1 + ) self.activation = nn.ReLU(inplace=True) self.head = DepthProFOVHead(config) @@ -1062,16 +1064,23 @@ def __init__(self, config): self.config = config features = config.fusion_hidden_size - self.layers = nn.ModuleList([ - nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), - nn.ConvTranspose2d( - in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True - ), - nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), - nn.ReLU(True), - nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), - nn.ReLU(), - ]) + self.layers = nn.ModuleList( + [ + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), + nn.ConvTranspose2d( + in_channels=features // 2, + out_channels=features // 2, + kernel_size=2, + stride=2, + padding=0, + bias=True, + ), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ] + ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: for layer in self.layers: From 524dda6f7a03a509ca4c66fbcdd255bc3eed158a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 23:24:24 +0500 Subject: [PATCH 130/151] remove float32 conversion --- .../models/depth_pro/convert_depth_pro_weights_to_hf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index ddb0b9bd9724..8efc830f924b 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -164,10 +164,6 @@ def write_model( file_path = hf_hub_download(hf_repo_id, "depth_pro.pt") loaded = torch.load(file_path, weights_only=True) - # ensure state_dict is in float32 - for key in loaded.keys(): - loaded[key] = loaded[key].to(torch.float32) - print("Converting model...") all_keys = list(loaded.keys()) new_keys = convert_old_keys_to_new_keys(all_keys) From a87d26a61bc6221311399297010e56f7338f5213 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 21:16:15 +0500 Subject: [PATCH 131/151] use "Fov" instead of "FOV" in class names --- .../models/depth_pro/modeling_depth_pro.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 26384139c267..aa12e995b3a2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -677,7 +677,7 @@ def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. use_fov_model (`bool`, *optional*, defaults to `True`): - Whether to use `DepthProFOVModel` to generate the field of view. + Whether to use `DepthProFovModel` to generate the field of view. 
""" @@ -932,7 +932,7 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: return fused_hidden_states -class DepthProFOVEncoder(nn.Module): +class DepthProFovEncoder(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config @@ -979,7 +979,7 @@ def forward( return features -class DepthProFOVHead(nn.Module): +class DepthProFovHead(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config @@ -1020,18 +1020,18 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: return features -class DepthProFOVModel(nn.Module): +class DepthProFovModel(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config self.fusion_hidden_size = config.fusion_hidden_size - self.fov_encoder = DepthProFOVEncoder(config) + self.fov_encoder = DepthProFovEncoder(config) self.conv = nn.Conv2d( self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1 ) self.activation = nn.ReLU(inplace=True) - self.head = DepthProFOVHead(config) + self.head = DepthProFovHead(config) def forward( self, @@ -1112,7 +1112,7 @@ def __init__(self, config, use_fov_model=None): self.head = DepthProDepthEstimationHead(config) # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None + self.fov_model = DepthProFovModel(config) if self.use_fov_model else None # Initialize weights and apply final processing self.post_init() From 5fccbff8de4eac6a1a161a3f620b71148d7d3268 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 21:18:45 +0500 Subject: [PATCH 132/151] use "Fov" instead of "FOV" in config docs --- src/transformers/models/depth_pro/configuration_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index ffc8033b55c4..36de741b704a 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -58,9 +58,9 @@ class DepthProConfig(PretrainedConfig): use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): Whether to use bias in the pre-activate residual units of the fusion blocks. use_fov_model (`bool`, *optional*, defaults to `False`): - Whether to use `DepthProFOVModel` to generate the field of view. + Whether to use `DepthProFovModel` to generate the field of view. num_fov_head_layers (`int`, *optional*, defaults to 2): - Number of convolution layers in the head of `DepthProFOVModel`. + Number of convolution layers in the head of `DepthProFovModel`. image_model_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): The configuration of the image encoder model, which is loaded using the [`AutoModel`] API. By default, Dinov2 model is used as backbone. 
From 24f1413abd06110de00e5a53579d16bd62707d34 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 21:44:46 +0500 Subject: [PATCH 133/151] remove prune_heads --- src/transformers/models/depth_pro/modeling_depth_pro.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index aa12e995b3a2..173adc8ac2d2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -728,14 +728,6 @@ def __init__(self, config): def get_input_embeddings(self): return self.encoder.image_encoder.model.get_input_embeddings() - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - self.encoder.patch_encoder.model._prune_heads(heads_to_prune) - self.encoder.image_encoder.model._prune_heads(heads_to_prune) - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( From a3dab1840d6ef19d416a9bcdeeadf8e9274be44f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 22:10:38 +0500 Subject: [PATCH 134/151] update fusion stage --- .../depth_pro/convert_depth_pro_weights_to_hf.py | 16 +++++++++++----- .../models/depth_pro/modeling_depth_pro.py | 13 +++++++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 8efc830f924b..07aebbe18607 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -75,14 +75,20 @@ ), # fusion stage - r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( - f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" + r"decoder.fusions.([1234]).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.intermediate.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" ), - r"decoder.fusions.(\d+).out_conv.(weight|bias)": lambda match: ( - f"fusion_stage.layers.{4-int(match.group(1))}.projection.{match.group(2)}" + r"decoder.fusions.0.resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.final.residual_layer{match.group(1)}.convolution{(int(match.group(2))+1)//2}.{match.group(3)}" + ), + r"decoder.fusions.([1234]).out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.intermediate.{4-int(match.group(1))}.projection.{match.group(2)}" + ), + r"decoder.fusions.0.out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.final.projection.{match.group(1)}" ), r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( - f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" + f"fusion_stage.intermediate.{4-int(match.group(1))}.deconv.{match.group(2)}" ), } # fmt: on diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 173adc8ac2d2..d994d66eeeff 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -898,11 +898,12 @@ def 
__init__(self, config): self.config = config self.num_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) - self.layers = nn.ModuleList() + self.intermediate = nn.ModuleList() for _ in range(self.num_layers - 1): - self.layers.append(DepthProFeatureFusionLayer(config)) + self.intermediate.append(DepthProFeatureFusionLayer(config)) + # final layer doesnot require deconvolution - self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) + self.final = DepthProFeatureFusionLayer(config, use_deconv=False) def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: if self.num_layers != len(hidden_states): @@ -913,7 +914,7 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: fused_hidden_states = [] fused_hidden_state = None - for hidden_state, layer in zip(hidden_states, self.layers): + for hidden_state, layer in zip(hidden_states[:-1], self.intermediate): if fused_hidden_state is None: # first layer only uses the last hidden_state fused_hidden_state = layer(hidden_state) @@ -921,6 +922,10 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: fused_hidden_state = layer(fused_hidden_state, hidden_state) fused_hidden_states.append(fused_hidden_state) + hidden_state = hidden_states[-1] + fused_hidden_state = self.final(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + return fused_hidden_states From 48eb534934fd7a63c1ad0482a300d45529875360 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 22:26:41 +0500 Subject: [PATCH 135/151] use device in examples --- docs/source/en/model_doc/depth_pro.md | 6 ++++-- src/transformers/models/depth_pro/modeling_depth_pro.py | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index a701497caea8..00ebed799b2e 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -43,13 +43,15 @@ The DepthPro model processes an input image by first downsampling it at multiple >>> import torch >>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation +>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) >>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/DepthPro") ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro") +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro").to(device) ->>> inputs = image_processor(images=image, return_tensors="pt") +>>> inputs = image_processor(images=image, return_tensors="pt").to(device) >>> with torch.no_grad(): ... 
outputs = model(**inputs) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index d994d66eeeff..e8421ab3bcea 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1146,8 +1146,11 @@ def forward( >>> processor = AutoImageProcessor.from_pretrained(checkpoint) >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> model.to(device) + >>> # prepare image for the model - >>> inputs = processor(images=image, return_tensors="pt") + >>> inputs = processor(images=image, return_tensors="pt").to(device) >>> with torch.no_grad(): ... outputs = model(**inputs) From ba37c9167edbd50d38d2e5efd2141472ce17b00c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 12:50:28 +0500 Subject: [PATCH 136/151] update processor --- .../convert_depth_pro_weights_to_hf.py | 12 +- .../depth_pro/image_processing_depth_pro.py | 24 +- .../image_processing_depth_pro_fast.py | 373 ++++++------------ .../test_image_processing_depth_pro.py | 10 +- 4 files changed, 125 insertions(+), 294 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 07aebbe18607..feebcd7fd9c0 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -203,17 +203,7 @@ def write_model( def write_image_processor(output_dir: str): - image_processor = DepthProImageProcessorFast( - do_resize=True, - size={"height": 1536, "width": 1536}, - resample=PILImageResampling.BILINEAR, - antialias=False, - do_rescale=True, - rescale_factor=1 / 255, - do_normalize=True, - image_mean=0.5, - image_std=0.5, - ) + image_processor = DepthProImageProcessorFast() image_processor.save_pretrained(output_dir) return image_processor diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 9a9568fdde05..5871e0f764cd 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -67,9 +67,6 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. 
@@ -94,7 +91,6 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -110,7 +106,6 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample - self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD @@ -120,7 +115,6 @@ def resize( image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -135,9 +129,6 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -172,7 +163,6 @@ def resize( input=image_tensor, size=output_size, mode=pil_torch_interpolation_mapping[resample].value, - antialias=antialias, ) resized_image = resized_image.squeeze(0).numpy() return resized_image @@ -182,7 +172,6 @@ def _validate_input_arguments( do_resize: bool, size: Dict[str, int], resample: PILImageResampling, - antialias: bool, do_rescale: bool, rescale_factor: float, do_normalize: bool, @@ -190,8 +179,8 @@ def _validate_input_arguments( image_std: Union[float, List[float]], data_format: Union[str, ChannelDimension], ): - if do_resize and None in (size, resample, antialias): - raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: raise ValueError("Rescale factor must be specified if do_rescale is True.") @@ -206,7 +195,6 @@ def preprocess( do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, resample: Optional[PILImageResampling] = None, - antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -231,9 +219,6 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -267,7 +252,6 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample - antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -285,7 +269,6 @@ def preprocess( do_resize=do_resize, size=size, resample=resample, - antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, @@ -321,7 +304,7 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) - image = self.resize(image=image, size=size, resample=resample, antialias=antialias) + image = self.resize(image=image, size=size, resample=resample) image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST) else: image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) @@ -387,7 +370,6 @@ def post_process_depth_estimation( input=depth.unsqueeze(0).unsqueeze(1), size=target_size, mode=pil_torch_interpolation_mapping[self.resample].value, - antialias=self.antialias, ).squeeze() # inverse the depth diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index a56ae831960d..cc6c3feace82 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -14,26 +14,34 @@ # limitations under the License. 
"""Fast Image processor class for DepthPro.""" -import functools from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union from ...image_processing_base import BatchFeature -from ...image_processing_utils import get_size_dict -from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict -from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale +from ...image_processing_utils_fast import ( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + BaseImageProcessorFast, + ChannelDimension, + get_image_size_for_max_height_width, + get_resize_output_image_size, + get_size_with_aspect_ratio, + group_images_by_shape, + reorder_images, +) from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, - ChannelDimension, - ImageInput, - ImageType, PILImageResampling, - get_image_type, - make_list_of_images, - pil_torch_interpolation_mapping, + SizeDict, +) +from ...utils import ( + TensorType, + add_start_docstrings, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + logging, + requires_backends, ) -from ...utils import TensorType, logging, requires_backends -from ...utils.import_utils import is_torch_available, is_torchvision_available if TYPE_CHECKING: @@ -47,268 +55,118 @@ if is_torchvision_available(): - from torchvision.transforms import Compose, Normalize, PILToTensor, Resize + from ...image_utils import pil_torch_interpolation_mapping + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + +@add_start_docstrings( + "Constructs a fast DepthPro image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, +) class DepthProImageProcessorFast(BaseImageProcessorFast): - r""" - Constructs a DepthPro image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `(size["height"], - size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. - size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): - Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` - method. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): - Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the - `preprocess` method. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` - method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): - Mean to use if normalizing the image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. 
- image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): - Standard deviation to use if normalizing the image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. - """ - - model_input_names = ["pixel_values"] - _transform_params = [ - "do_resize", - "do_rescale", - "do_normalize", - "size", - "resample", - "antialias", - "rescale_factor", - "image_mean", - "image_std", - "image_type", - ] - - def __init__( + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 1536, "width": 1536} + do_resize = True + do_rescale = True + do_normalize = True + + # Only difference with BaseImageProcessorFast.resize is that `antialias=False` in F.resize + def resize( self, - do_resize: bool = True, - size: Optional[Dict[str, int]] = None, - resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, + image: "torch.Tensor", + size: SizeDict, + interpolation: "F.InterpolationMode" = None, **kwargs, - ): - super().__init__(**kwargs) - size = size if size is not None else {"height": 1536, "width": 1536} - size = get_size_dict(size) - self.do_resize = do_resize - self.do_rescale = do_rescale - self.do_normalize = do_normalize - self.size = size - self.resample = resample - self.antialias = antialias - self.rescale_factor = rescale_factor - self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD - - def _build_transforms( - self, - do_resize: bool, - size: Dict[str, int], - resample: PILImageResampling, - antialias: bool, - do_rescale: bool, - rescale_factor: float, - do_normalize: bool, - image_mean: Union[float, List[float]], - image_std: Union[float, List[float]], - image_type: ImageType, - ) -> "Compose": + ) -> "torch.Tensor": """ - Given the input settings build the image transforms using `torchvision.transforms.Compose`. + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + + Returns: + `torch.Tensor`: The resized image. 
""" - transforms = [] - - # All PIL and numpy values need to be converted to a torch tensor - # to keep cross compatibility with slow image processors - if image_type == ImageType.PIL: - transforms.append(PILToTensor()) - - elif image_type == ImageType.NUMPY: - transforms.append(NumpyToTensor()) - - # We can combine rescale and normalize into a single operation for speed - if do_rescale and do_normalize: - transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor)) - elif do_rescale: - transforms.append(Rescale(rescale_factor=rescale_factor)) - elif do_normalize: - transforms.append(Normalize(image_mean, image_std)) - - # depth-pro scales the image before resizing it - if do_resize: - transforms.append( - Resize( - (size["height"], size["width"]), - interpolation=pil_torch_interpolation_mapping[resample], - antialias=antialias, - ) + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. + new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size.shortest_edge, + size.longest_edge, ) + elif size.shortest_edge: + new_size = get_resize_output_image_size( + image, + size=size.shortest_edge, + default_to_square=False, + input_data_format=ChannelDimension.FIRST, + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width) + elif size.height and size.width: + new_size = (size.height, size.width) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" + f" {size}." 
+ ) + return F.resize(image, new_size, interpolation=interpolation, antialias=False) - return Compose(transforms) - - @functools.lru_cache(maxsize=1) - def _validate_input_arguments( + # DepthPro resizes image after rescaling and normalizing, + # which makes it different from BaseImageProcessorFast._preprocess + def _preprocess( self, - return_tensors: Union[str, TensorType], + images: List["torch.Tensor"], do_resize: bool, - size: Dict[str, int], - resample: PILImageResampling, - antialias: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, do_rescale: bool, rescale_factor: float, do_normalize: bool, - image_mean: Union[float, List[float]], - image_std: Union[float, List[float]], - data_format: Union[str, ChannelDimension], - image_type: ImageType, - ): - if return_tensors != "pt": - raise ValueError("Only returning PyTorch tensors is currently supported.") - - if data_format != ChannelDimension.FIRST: - raise ValueError("Only channel first data format is currently supported.") - - if do_resize and None in (size, resample, antialias): - raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + image_mean: Optional[Union[float, List[float]]], + image_std: Optional[Union[float, List[float]]], + return_tensors: Optional[Union[str, TensorType]], + ) -> BatchFeature: + # Group images by size for batched scaling + grouped_images, grouped_images_index = group_images_by_shape(images) + scaled_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + scaled_images_grouped[shape] = stacked_images + scaled_images = reorder_images(scaled_images_grouped, grouped_images_index) - if do_rescale and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(scaled_images) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) - if do_normalize and None in (image_mean, image_std): - raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") + processed_images = torch.stack(resized_images, dim=0) if return_tensors else resized_images - def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - resample: Optional[PILImageResampling] = None, - antialias: Optional[bool] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - return_tensors: Optional[Union[str, TensorType]] = "pt", - data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ): - """ - Preprocess an image or batch of images. - - Args: - images (`ImageInput`): - Image to preprocess. 
Expects a single or batch of images with pixel values ranging from 0 to 255. If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after - resizing. - resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): - `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has - an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image values between [0 - 1]. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use if `do_normalize` is set to `True`. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Only "pt" is supported - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. The following formats are currently supported: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
- """ - do_resize = do_resize if do_resize is not None else self.do_resize - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - resample = resample if resample is not None else self.resample - antialias = antialias if antialias is not None else self.antialias - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - size = size if size is not None else self.size - # Make hashable for cache - size = SizeDict(**size) - image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean - image_std = tuple(image_std) if isinstance(image_std, list) else image_std - - images = make_list_of_images(images) - image_type = get_image_type(images[0]) - - if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]: - raise ValueError(f"Unsupported input image type {image_type}") - - self._validate_input_arguments( - do_resize=do_resize, - size=size, - resample=resample, - antialias=antialias, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - return_tensors=return_tensors, - data_format=data_format, - image_type=image_type, - ) - - transforms = self.get_transforms( - do_resize=do_resize, - do_rescale=do_rescale, - do_normalize=do_normalize, - size=size, - resample=resample, - antialias=antialias, - rescale_factor=rescale_factor, - image_mean=image_mean, - image_std=image_std, - image_type=image_type, - ) - transformed_images = [transforms(image) for image in images] - - data = {"pixel_values": torch.stack(transformed_images, dim=0)} - return BatchFeature(data, tensor_type=return_tensors) + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) # Copied from transformers.models.depth_pro.image_processing_depth_pro.DepthProImageProcessor.post_process_depth_estimation def post_process_depth_estimation( @@ -367,7 +225,6 @@ def post_process_depth_estimation( input=depth.unsqueeze(0).unsqueeze(1), size=target_size, mode=pil_torch_interpolation_mapping[self.resample].value, - antialias=self.antialias, ).squeeze() # inverse the depth diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index 0e830698c0a1..e754e53e009f 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -16,14 +16,17 @@ import unittest -from transformers.file_utils import is_vision_available from transformers.testing_utils import is_flaky, require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs if is_vision_available(): - from transformers import DepthProImageProcessor, DepthProImageProcessorFast + from transformers import DepthProImageProcessor + + if is_torchvision_available(): + from transformers import DepthProImageProcessorFast class DepthProImageProcessingTester(unittest.TestCase): @@ -83,7 +86,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_vision class DepthProImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = DepthProImageProcessor if 
is_vision_available() else None - fast_image_processing_class = DepthProImageProcessorFast if is_vision_available() else None + fast_image_processing_class = DepthProImageProcessorFast if is_torchvision_available() else None def setUp(self): super().setUp() @@ -103,7 +106,6 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processing, "do_rescale")) self.assertTrue(hasattr(image_processing, "rescale_factor")) self.assertTrue(hasattr(image_processing, "resample")) - self.assertTrue(hasattr(image_processing, "antialias")) def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict) From 949ecb969a8f2459bc2975856ff808240f95ff98 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 12:58:10 +0500 Subject: [PATCH 137/151] ruff fixes --- .../models/depth_pro/convert_depth_pro_weights_to_hf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index feebcd7fd9c0..f2cfc0bdd758 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -25,7 +25,6 @@ DepthProForDepthEstimation, DepthProImageProcessorFast, ) -from transformers.image_utils import PILImageResampling # fmt: off From 0e2861d1af01fbc33ac1d2101c2b03a01f42bfab Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 13:34:50 +0500 Subject: [PATCH 138/151] add do_rescale in image_processor_dict --- tests/models/depth_pro/test_image_processing_depth_pro.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index e754e53e009f..b30931a86cdb 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -40,6 +40,7 @@ def __init__( max_resolution=400, do_resize=True, size=None, + do_rescale=True, do_normalize=True, image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5], @@ -54,6 +55,7 @@ def __init__( self.max_resolution = max_resolution self.do_resize = do_resize self.size = size + self.do_rescale = do_rescale self.do_normalize = do_normalize self.image_mean = image_mean self.image_std = image_std @@ -62,6 +64,7 @@ def prepare_image_processor_dict(self): return { "image_mean": self.image_mean, "image_std": self.image_std, + "do_rescale": self.do_rescale, "do_normalize": self.do_normalize, "do_resize": self.do_resize, "size": self.size, From a6efedb8d80245496cefb89c77ce0a9732888a9c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 13:36:09 +0500 Subject: [PATCH 139/151] skip test: test_fast_is_faster_than_slow --- tests/models/depth_pro/test_image_processing_depth_pro.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index b30931a86cdb..13f329018acd 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -117,8 +117,8 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - @is_flaky( - description="fast and slow, 
both processors use torch implementation, see: https://github.com/huggingface/transformers/issues/34920", + @unittest.skip( + reason="both processors (fast and slow) use torch for resizing, check: https://github.com/huggingface/transformers/issues/34920", ) def test_fast_is_faster_than_slow(self): - super().test_fast_is_faster_than_slow() + pass From 4d8f927a86ce6ad4fcde12daeb88ec7052d2749a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 13:38:15 +0500 Subject: [PATCH 140/151] ruff formatting --- tests/models/depth_pro/test_image_processing_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index 13f329018acd..434741b13e1b 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -16,7 +16,7 @@ import unittest -from transformers.testing_utils import is_flaky, require_torch, require_vision +from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs From dd8de27023dbe0d4a294fe598366a60cbbd3449f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 14:03:54 +0500 Subject: [PATCH 141/151] DepthProImageProcessorFast in other files --- src/transformers/__init__.py | 2 ++ src/transformers/utils/dummy_torchvision_objects.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a0144429364b..d9db0a0fd6e3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1315,6 +1315,7 @@ _import_structure["models.convnext"].append("ConvNextImageProcessorFast") _import_structure["models.deformable_detr"].append("DeformableDetrImageProcessorFast") _import_structure["models.deit"].append("DeiTImageProcessorFast") + _import_structure["models.depth_pro"].append("DepthProImageProcessorFast") _import_structure["models.detr"].append("DetrImageProcessorFast") _import_structure["models.llava"].append("LlavaImageProcessorFast") _import_structure["models.llava_next"].append("LlavaNextImageProcessorFast") @@ -6466,6 +6467,7 @@ from .models.convnext import ConvNextImageProcessorFast from .models.deformable_detr import DeformableDetrImageProcessorFast from .models.deit import DeiTImageProcessorFast + from .models.depth_pro import DepthProImageProcessorFast from .models.detr import DetrImageProcessorFast from .models.llava import LlavaImageProcessorFast from .models.llava_next import LlavaNextImageProcessorFast diff --git a/src/transformers/utils/dummy_torchvision_objects.py b/src/transformers/utils/dummy_torchvision_objects.py index f1b75efc2071..87b60fbc0463 100644 --- a/src/transformers/utils/dummy_torchvision_objects.py +++ b/src/transformers/utils/dummy_torchvision_objects.py @@ -44,6 +44,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torchvision"]) +class DepthProImageProcessorFast(metaclass=DummyObject): + _backends = ["torchvision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torchvision"]) + + class DetrImageProcessorFast(metaclass=DummyObject): _backends = ["torchvision"] From 5caa0bd8f9f7463b98410c04e6cfe8fef3adee18 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 18:07:45 +0500 Subject: [PATCH 142/151] revert 
antialias removal --- .../depth_pro/image_processing_depth_pro.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 5871e0f764cd..9a9568fdde05 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -67,6 +67,9 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. @@ -91,6 +94,7 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -106,6 +110,7 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample + self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD @@ -115,6 +120,7 @@ def resize( image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -129,6 +135,9 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. 
Can be one of: @@ -163,6 +172,7 @@ def resize( input=image_tensor, size=output_size, mode=pil_torch_interpolation_mapping[resample].value, + antialias=antialias, ) resized_image = resized_image.squeeze(0).numpy() return resized_image @@ -172,6 +182,7 @@ def _validate_input_arguments( do_resize: bool, size: Dict[str, int], resample: PILImageResampling, + antialias: bool, do_rescale: bool, rescale_factor: float, do_normalize: bool, @@ -179,8 +190,8 @@ def _validate_input_arguments( image_std: Union[float, List[float]], data_format: Union[str, ChannelDimension], ): - if do_resize and None in (size, resample): - raise ValueError("Size and resample must be specified if do_resize is True.") + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") if do_rescale and rescale_factor is None: raise ValueError("Rescale factor must be specified if do_rescale is True.") @@ -195,6 +206,7 @@ def preprocess( do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, resample: Optional[PILImageResampling] = None, + antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -219,6 +231,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -252,6 +267,7 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -269,6 +285,7 @@ def preprocess( do_resize=do_resize, size=size, resample=resample, + antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, @@ -304,7 +321,7 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) - image = self.resize(image=image, size=size, resample=resample) + image = self.resize(image=image, size=size, resample=resample, antialias=antialias) image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST) else: image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) @@ -370,6 +387,7 @@ def post_process_depth_estimation( input=depth.unsqueeze(0).unsqueeze(1), size=target_size, mode=pil_torch_interpolation_mapping[self.resample].value, + antialias=self.antialias, ).squeeze() # inverse the depth From 3ae1134780ae236872985523d9c0a444eabcc179 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 18:45:35 +0500 Subject: [PATCH 143/151] add antialias in BaseImageProcessorFast --- .../image_processing_utils_fast.py | 25 +++++++- .../image_processing_depth_pro_fast.py | 62 +++---------------- 2 files changed, 31 insertions(+), 56 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index cb7d1c46aa79..20dc06e53b3b 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -132,6 +132,7 @@ class DefaultFastImageProcessorInitKwargs(TypedDict, total=False): size: Optional[Dict[str, int]] default_to_square: Optional[bool] resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] + antialias: Optional[bool] do_center_crop: Optional[bool] crop_size: Optional[Dict[str, int]] do_rescale: Optional[bool] @@ -163,6 +164,9 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa resample (`PILImageResampling`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `resample` parameter in the `preprocess` method. + antialias (`bool`, *optional*, defaults to `True`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the `preprocess` method. 
@@ -203,6 +207,9 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only has an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `True`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): Whether to center crop the image. crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): @@ -243,6 +250,7 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa ) class BaseImageProcessorFast(BaseImageProcessor): resample = None + antialias = None image_mean = None image_std = None size = None @@ -283,6 +291,7 @@ def resize( image: "torch.Tensor", size: SizeDict, interpolation: "F.InterpolationMode" = None, + antialias: bool = True, **kwargs, ) -> "torch.Tensor": """ @@ -295,11 +304,15 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + antialias (`bool`, *optional*, defaults to `True`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. Returns: `torch.Tensor`: The resized image. """ interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + antialias = antialias if antialias is not None else True if size.shortest_edge and size.longest_edge: # Resize the image so that the shortest edge or the longest edge is of the given size # while maintaining the aspect ratio of the original image. @@ -324,7 +337,7 @@ def resize( "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" f" {size}." 
) - return F.resize(image, new_size, interpolation=interpolation) + return F.resize(image, new_size, interpolation=interpolation, antialias=antialias) def rescale( self, @@ -578,6 +591,7 @@ def preprocess( image_std = kwargs.pop("image_std") data_format = kwargs.pop("data_format") resample = kwargs.pop("resample") + antialias = kwargs.pop("antialias") # Make hashable for cache size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square)) if size is not None else None @@ -606,6 +620,7 @@ def preprocess( size=size, crop_size=crop_size, interpolation=interpolation, + antialias=antialias, image_mean=image_mean, image_std=image_std, **kwargs, @@ -617,6 +632,7 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], + antialias: Optional[bool], do_center_crop: bool, crop_size: SizeDict, do_rescale: bool, @@ -631,7 +647,12 @@ def _preprocess( resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + antialias=antialias, + ) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index cc6c3feace82..2b9870c6dc0a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -20,10 +20,6 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast, - ChannelDimension, - get_image_size_for_max_height_width, - get_resize_output_image_size, - get_size_with_aspect_ratio, group_images_by_shape, reorder_images, ) @@ -69,6 +65,7 @@ ) class DepthProImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR + antialias = False image_mean = IMAGENET_STANDARD_MEAN image_std = IMAGENET_STANDARD_STD size = {"height": 1536, "width": 1536} @@ -76,55 +73,6 @@ class DepthProImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True - # Only difference with BaseImageProcessorFast.resize is that `antialias=False` in F.resize - def resize( - self, - image: "torch.Tensor", - size: SizeDict, - interpolation: "F.InterpolationMode" = None, - **kwargs, - ) -> "torch.Tensor": - """ - Resize an image to `(size["height"], size["width"])`. - - Args: - image (`torch.Tensor`): - Image to resize. - size (`SizeDict`): - Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. - resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): - `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. - - Returns: - `torch.Tensor`: The resized image. - """ - interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR - if size.shortest_edge and size.longest_edge: - # Resize the image so that the shortest edge or the longest edge is of the given size - # while maintaining the aspect ratio of the original image. 
- new_size = get_size_with_aspect_ratio( - image.size()[-2:], - size.shortest_edge, - size.longest_edge, - ) - elif size.shortest_edge: - new_size = get_resize_output_image_size( - image, - size=size.shortest_edge, - default_to_square=False, - input_data_format=ChannelDimension.FIRST, - ) - elif size.max_height and size.max_width: - new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width) - elif size.height and size.width: - new_size = (size.height, size.width) - else: - raise ValueError( - "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" - f" {size}." - ) - return F.resize(image, new_size, interpolation=interpolation, antialias=False) - # DepthPro resizes image after rescaling and normalizing, # which makes it different from BaseImageProcessorFast._preprocess def _preprocess( @@ -133,6 +81,7 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], + antialias: bool, do_center_crop: bool, crop_size: SizeDict, do_rescale: bool, @@ -160,7 +109,12 @@ def _preprocess( resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + antialias=antialias, + ) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) From 8372ad9d7437ddcac07a4d0578230e7974c154a2 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 19:40:29 +0500 Subject: [PATCH 144/151] Revert "revert antialias removal" This reverts commit 5caa0bd8f9f7463b98410c04e6cfe8fef3adee18. --- .../depth_pro/image_processing_depth_pro.py | 24 +++---------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 9a9568fdde05..5871e0f764cd 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -67,9 +67,6 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. 
@@ -94,7 +91,6 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -110,7 +106,6 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample - self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD @@ -120,7 +115,6 @@ def resize( image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -135,9 +129,6 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -172,7 +163,6 @@ def resize( input=image_tensor, size=output_size, mode=pil_torch_interpolation_mapping[resample].value, - antialias=antialias, ) resized_image = resized_image.squeeze(0).numpy() return resized_image @@ -182,7 +172,6 @@ def _validate_input_arguments( do_resize: bool, size: Dict[str, int], resample: PILImageResampling, - antialias: bool, do_rescale: bool, rescale_factor: float, do_normalize: bool, @@ -190,8 +179,8 @@ def _validate_input_arguments( image_std: Union[float, List[float]], data_format: Union[str, ChannelDimension], ): - if do_resize and None in (size, resample, antialias): - raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: raise ValueError("Rescale factor must be specified if do_rescale is True.") @@ -206,7 +195,6 @@ def preprocess( do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, resample: Optional[PILImageResampling] = None, - antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -231,9 +219,6 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -267,7 +252,6 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample - antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -285,7 +269,6 @@ def preprocess( do_resize=do_resize, size=size, resample=resample, - antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, @@ -321,7 +304,7 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) - image = self.resize(image=image, size=size, resample=resample, antialias=antialias) + image = self.resize(image=image, size=size, resample=resample) image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST) else: image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) @@ -387,7 +370,6 @@ def post_process_depth_estimation( input=depth.unsqueeze(0).unsqueeze(1), size=target_size, mode=pil_torch_interpolation_mapping[self.resample].value, - antialias=self.antialias, ).squeeze() # inverse the depth From 666f3b73616ed5c0cd16f42360e4e2018e524a1f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 19:40:40 +0500 Subject: [PATCH 145/151] Revert "add antialias in BaseImageProcessorFast" This reverts commit 3ae1134780ae236872985523d9c0a444eabcc179. --- .../image_processing_utils_fast.py | 25 +------- .../image_processing_depth_pro_fast.py | 62 ++++++++++++++++--- 2 files changed, 56 insertions(+), 31 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 20dc06e53b3b..cb7d1c46aa79 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -132,7 +132,6 @@ class DefaultFastImageProcessorInitKwargs(TypedDict, total=False): size: Optional[Dict[str, int]] default_to_square: Optional[bool] resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] - antialias: Optional[bool] do_center_crop: Optional[bool] crop_size: Optional[Dict[str, int]] do_rescale: Optional[bool] @@ -164,9 +163,6 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa resample (`PILImageResampling`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `resample` parameter in the `preprocess` method. - antialias (`bool`, *optional*, defaults to `True`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the `preprocess` method. 
@@ -207,9 +203,6 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `True`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): Whether to center crop the image. crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): @@ -250,7 +243,6 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa ) class BaseImageProcessorFast(BaseImageProcessor): resample = None - antialias = None image_mean = None image_std = None size = None @@ -291,7 +283,6 @@ def resize( image: "torch.Tensor", size: SizeDict, interpolation: "F.InterpolationMode" = None, - antialias: bool = True, **kwargs, ) -> "torch.Tensor": """ @@ -304,15 +295,11 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. - antialias (`bool`, *optional*, defaults to `True`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. Returns: `torch.Tensor`: The resized image. """ interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR - antialias = antialias if antialias is not None else True if size.shortest_edge and size.longest_edge: # Resize the image so that the shortest edge or the longest edge is of the given size # while maintaining the aspect ratio of the original image. @@ -337,7 +324,7 @@ def resize( "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" f" {size}." 
) - return F.resize(image, new_size, interpolation=interpolation, antialias=antialias) + return F.resize(image, new_size, interpolation=interpolation) def rescale( self, @@ -591,7 +578,6 @@ def preprocess( image_std = kwargs.pop("image_std") data_format = kwargs.pop("data_format") resample = kwargs.pop("resample") - antialias = kwargs.pop("antialias") # Make hashable for cache size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square)) if size is not None else None @@ -620,7 +606,6 @@ def preprocess( size=size, crop_size=crop_size, interpolation=interpolation, - antialias=antialias, image_mean=image_mean, image_std=image_std, **kwargs, @@ -632,7 +617,6 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], - antialias: Optional[bool], do_center_crop: bool, crop_size: SizeDict, do_rescale: bool, @@ -647,12 +631,7 @@ def _preprocess( resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize( - image=stacked_images, - size=size, - interpolation=interpolation, - antialias=antialias, - ) + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 2b9870c6dc0a..cc6c3feace82 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -20,6 +20,10 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast, + ChannelDimension, + get_image_size_for_max_height_width, + get_resize_output_image_size, + get_size_with_aspect_ratio, group_images_by_shape, reorder_images, ) @@ -65,7 +69,6 @@ ) class DepthProImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR - antialias = False image_mean = IMAGENET_STANDARD_MEAN image_std = IMAGENET_STANDARD_STD size = {"height": 1536, "width": 1536} @@ -73,6 +76,55 @@ class DepthProImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True + # Only difference with BaseImageProcessorFast.resize is that `antialias=False` in F.resize + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + interpolation: "F.InterpolationMode" = None, + **kwargs, + ) -> "torch.Tensor": + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + + Returns: + `torch.Tensor`: The resized image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. 
+ new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size.shortest_edge, + size.longest_edge, + ) + elif size.shortest_edge: + new_size = get_resize_output_image_size( + image, + size=size.shortest_edge, + default_to_square=False, + input_data_format=ChannelDimension.FIRST, + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width) + elif size.height and size.width: + new_size = (size.height, size.width) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" + f" {size}." + ) + return F.resize(image, new_size, interpolation=interpolation, antialias=False) + # DepthPro resizes image after rescaling and normalizing, # which makes it different from BaseImageProcessorFast._preprocess def _preprocess( @@ -81,7 +133,6 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], - antialias: bool, do_center_crop: bool, crop_size: SizeDict, do_rescale: bool, @@ -109,12 +160,7 @@ def _preprocess( resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize( - image=stacked_images, - size=size, - interpolation=interpolation, - antialias=antialias, - ) + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) From 41180e37e17035eae3c1fe3da6e8bfbe583713d1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 19:55:44 +0500 Subject: [PATCH 146/151] update processor for grouping and antialias --- .../image_processing_utils_fast.py | 3 +- .../image_processing_depth_pro_fast.py | 79 +++---------------- 2 files changed, 13 insertions(+), 69 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index cb7d1c46aa79..d21d35212144 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -283,6 +283,7 @@ def resize( image: "torch.Tensor", size: SizeDict, interpolation: "F.InterpolationMode" = None, + antialias: bool = True, **kwargs, ) -> "torch.Tensor": """ @@ -324,7 +325,7 @@ def resize( "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" f" {size}." 
) - return F.resize(image, new_size, interpolation=interpolation) + return F.resize(image, new_size, interpolation=interpolation, antialias=antialias) def rescale( self, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index cc6c3feace82..43a23bf10b5e 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -20,10 +20,6 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast, - ChannelDimension, - get_image_size_for_max_height_width, - get_resize_output_image_size, - get_size_with_aspect_ratio, group_images_by_shape, reorder_images, ) @@ -76,55 +72,6 @@ class DepthProImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True - # Only difference with BaseImageProcessorFast.resize is that `antialias=False` in F.resize - def resize( - self, - image: "torch.Tensor", - size: SizeDict, - interpolation: "F.InterpolationMode" = None, - **kwargs, - ) -> "torch.Tensor": - """ - Resize an image to `(size["height"], size["width"])`. - - Args: - image (`torch.Tensor`): - Image to resize. - size (`SizeDict`): - Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. - resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): - `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. - - Returns: - `torch.Tensor`: The resized image. - """ - interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR - if size.shortest_edge and size.longest_edge: - # Resize the image so that the shortest edge or the longest edge is of the given size - # while maintaining the aspect ratio of the original image. - new_size = get_size_with_aspect_ratio( - image.size()[-2:], - size.shortest_edge, - size.longest_edge, - ) - elif size.shortest_edge: - new_size = get_resize_output_image_size( - image, - size=size.shortest_edge, - default_to_square=False, - input_data_format=ChannelDimension.FIRST, - ) - elif size.max_height and size.max_width: - new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width) - elif size.height and size.width: - new_size = (size.height, size.width) - else: - raise ValueError( - "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" - f" {size}." 
- ) - return F.resize(image, new_size, interpolation=interpolation, antialias=False) - # DepthPro resizes image after rescaling and normalizing, # which makes it different from BaseImageProcessorFast._preprocess def _preprocess( @@ -144,27 +91,23 @@ def _preprocess( ) -> BatchFeature: # Group images by size for batched scaling grouped_images, grouped_images_index = group_images_by_shape(images) - scaled_images_grouped = {} + processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): - if do_center_crop: - stacked_images = self.center_crop(stacked_images, crop_size) # Fused rescale and normalize stacked_images = self.rescale_and_normalize( stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std ) - scaled_images_grouped[shape] = stacked_images - scaled_images = reorder_images(scaled_images_grouped, grouped_images_index) - - # Group images by size for batched resizing - grouped_images, grouped_images_index = group_images_by_shape(scaled_images) - resized_images_grouped = {} - for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) - resized_images_grouped[shape] = stacked_images - resized_images = reorder_images(resized_images_grouped, grouped_images_index) - - processed_images = torch.stack(resized_images, dim=0) if return_tensors else resized_images + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + antialias=False, + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) From 1265b12d2258af67a512cdba7651b45cac8c17f5 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 20:01:50 +0500 Subject: [PATCH 147/151] try test_fast_is_faster_than_slow without "skip" or "flanky" --- tests/models/depth_pro/test_image_processing_depth_pro.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index 434741b13e1b..5827512478d1 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -116,9 +116,3 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - - @unittest.skip( - reason="both processors (fast and slow) use torch for resizing, check: https://github.com/huggingface/transformers/issues/34920", - ) - def test_fast_is_faster_than_slow(self): - pass From 4dc850fcb77be76dd7f2dc0fb3911beadaa8f751 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 23:28:29 +0500 Subject: [PATCH 148/151] update checkpoint --- docs/source/en/model_doc/depth_pro.md | 10 +++++----- .../depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- tests/models/depth_pro/test_modeling_depth_pro.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 00ebed799b2e..e84f5a41a355 100644 --- 
a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -48,8 +48,8 @@ The DepthPro model processes an input image by first downsampling it at multiple >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) ->>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/DepthPro") ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro").to(device) +>>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/depth-pro-hf") +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf").to(device) >>> inputs = image_processor(images=image, return_tensors="pt").to(device) @@ -96,10 +96,10 @@ The network is supplemented with a focal length estimation head. A small convolu The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model. -The pretrained model at checkpoint `geetu040/DepthPro` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. +The pretrained model at checkpoint `geetu040/depth-pro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. ```py >>> from transformers import DepthProForDepthEstimation ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", use_fov_model=False) +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf", use_fov_model=False) ``` To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config. @@ -129,7 +129,7 @@ SDPA is used by default for `torch>=2.1.1` when an implementation is available, ```py from transformers import DepthProForDepthEstimation -model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", attn_implementation="sdpa", torch_dtype=torch.float16) +model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) ``` For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). 
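
The documentation hunk above stops at the forward pass. For completeness, the sketch below (illustrative only, not part of the patch) shows how the predicted depth can be brought back to the original image resolution; it assumes the checkpoint name in use at this stage of the series, `geetu040/depth-pro-hf`, and the usual `(batch, height, width)` layout of `outputs.predicted_depth`. The `post_process_depth_estimation` helper exercised by the model tests wraps essentially the same resizing step.

```py
import requests
import torch
from PIL import Image

from transformers import DepthProForDepthEstimation, DepthProImageProcessorFast

# Checkpoint name as of this patch; it is renamed in later commits of this series.
checkpoint = "geetu040/depth-pro-hf"
image_processor = DepthProImageProcessorFast.from_pretrained(checkpoint)
model = DepthProForDepthEstimation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# predicted_depth is assumed to be (batch, height, width); add a channel axis,
# resize back to the input resolution, then drop the channel axis again.
depth = torch.nn.functional.interpolate(
    outputs.predicted_depth.unsqueeze(1),
    size=(image.height, image.width),
    mode="bilinear",
    align_corners=False,
).squeeze(1)
print(depth.shape)
```
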
diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index f2cfc0bdd758..9a41a6aa027a 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -229,7 +229,7 @@ def main(): ) parser.add_argument( "--hub_repo_id", - default="geetu040/DepthPro", + default="geetu040/depth-pro-hf", help="Huggingface hub repo to write the converted model and processor", ) args = parser.parse_args() diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index e8421ab3bcea..319b782a5123 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -752,7 +752,7 @@ def forward( >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> checkpoint = "geetu040/DepthPro" + >>> checkpoint = "geetu040/depth-pro-hf" >>> processor = AutoProcessor.from_pretrained(checkpoint) >>> model = DepthProModel.from_pretrained(checkpoint) @@ -1142,7 +1142,7 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> checkpoint = "geetu040/DepthPro" + >>> checkpoint = "geetu040/depth-pro-hf" >>> processor = AutoImageProcessor.from_pretrained(checkpoint) >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 1e4ceadbd4eb..fc0d033c0d43 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -329,7 +329,7 @@ def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): @slow def test_model_from_pretrained(self): - model_path = "geetu040/DepthPro" + model_path = "geetu040/depth-pro-hf" model = DepthProModel.from_pretrained(model_path) self.assertIsNotNone(model) @@ -345,7 +345,7 @@ def prepare_img(): @slow class DepthProModelIntegrationTest(unittest.TestCase): def test_inference_depth_estimation(self): - model_path = "geetu040/DepthPro" + model_path = "geetu040/depth-pro-hf" image_processor = DepthProImageProcessor.from_pretrained(model_path) model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device) config = model.config @@ -378,7 +378,7 @@ def test_inference_depth_estimation(self): torch.testing.assert_close(outputs.field_of_view, expected_slice, atol=1e-4, rtol=1e-4) def test_post_processing_depth_estimation(self): - model_path = "geetu040/DepthPro" + model_path = "geetu040/depth-pro-hf" image_processor = DepthProImageProcessor.from_pretrained(model_path) model = DepthProForDepthEstimation.from_pretrained(model_path) From 592648c11687b2366d48f2fa721a283b6d052874 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 6 Feb 2025 08:24:09 +0500 Subject: [PATCH 149/151] update checkpoint --- docs/source/en/model_doc/depth_pro.md | 10 +++++----- .../depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- tests/models/depth_pro/test_modeling_depth_pro.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index e84f5a41a355..9ac15c6081d4 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ 
b/docs/source/en/model_doc/depth_pro.md @@ -48,8 +48,8 @@ The DepthPro model processes an input image by first downsampling it at multiple >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) ->>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/depth-pro-hf") ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf").to(device) +>>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/depth-pro-hf") +>>> model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf").to(device) >>> inputs = image_processor(images=image, return_tensors="pt").to(device) @@ -96,10 +96,10 @@ The network is supplemented with a focal length estimation head. A small convolu The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model. -The pretrained model at checkpoint `geetu040/depth-pro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. +The pretrained model at checkpoint `apple/depth-pro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. ```py >>> from transformers import DepthProForDepthEstimation ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf", use_fov_model=False) +>>> model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf", use_fov_model=False) ``` To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config. @@ -129,7 +129,7 @@ SDPA is used by default for `torch>=2.1.1` when an implementation is available, ```py from transformers import DepthProForDepthEstimation -model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) +model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) ``` For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). 
diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py
index 9a41a6aa027a..ec8732f80616 100644
--- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py
+++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py
@@ -229,7 +229,7 @@ def main():
     )
     parser.add_argument(
         "--hub_repo_id",
-        default="geetu040/depth-pro-hf",
+        default="apple/depth-pro-hf",
         help="Huggingface hub repo to write the converted model and processor",
     )
     args = parser.parse_args()

diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py
index 319b782a5123..3ba78dc2ad67 100644
--- a/src/transformers/models/depth_pro/modeling_depth_pro.py
+++ b/src/transformers/models/depth_pro/modeling_depth_pro.py
@@ -752,7 +752,7 @@ def forward(
         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)

-        >>> checkpoint = "geetu040/depth-pro-hf"
+        >>> checkpoint = "apple/depth-pro-hf"
         >>> processor = AutoProcessor.from_pretrained(checkpoint)
         >>> model = DepthProModel.from_pretrained(checkpoint)

@@ -1142,7 +1142,7 @@ def forward(
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)

-        >>> checkpoint = "geetu040/depth-pro-hf"
+        >>> checkpoint = "apple/depth-pro-hf"
         >>> processor = AutoImageProcessor.from_pretrained(checkpoint)
         >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint)

diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py
index fc0d033c0d43..62b7465cac54 100644
--- a/tests/models/depth_pro/test_modeling_depth_pro.py
+++ b/tests/models/depth_pro/test_modeling_depth_pro.py
@@ -329,7 +329,7 @@ def test_batching_equivalence(self, atol=1e-4, rtol=1e-4):

     @slow
     def test_model_from_pretrained(self):
-        model_path = "geetu040/depth-pro-hf"
+        model_path = "apple/depth-pro-hf"
         model = DepthProModel.from_pretrained(model_path)
         self.assertIsNotNone(model)

@@ -345,7 +345,7 @@ def prepare_img():
 @slow
 class DepthProModelIntegrationTest(unittest.TestCase):
     def test_inference_depth_estimation(self):
-        model_path = "geetu040/depth-pro-hf"
+        model_path = "apple/depth-pro-hf"
         image_processor = DepthProImageProcessor.from_pretrained(model_path)
         model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device)
         config = model.config
@@ -378,7 +378,7 @@ def test_inference_depth_estimation(self):
         torch.testing.assert_close(outputs.field_of_view, expected_slice, atol=1e-4, rtol=1e-4)

     def test_post_processing_depth_estimation(self):
-        model_path = "geetu040/depth-pro-hf"
+        model_path = "apple/depth-pro-hf"
         image_processor = DepthProImageProcessor.from_pretrained(model_path)
         model = DepthProForDepthEstimation.from_pretrained(model_path)

From 162f14166f2ebd36772967da51068cad846bc8e3 Mon Sep 17 00:00:00 2001
From: geetu040
Date: Thu, 6 Feb 2025 16:33:02 +0500
Subject: [PATCH 150/151] use @is_flaky for processor test
---
 tests/models/depth_pro/test_image_processing_depth_pro.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py
index 5827512478d1..b30931a86cdb 100644
--- a/tests/models/depth_pro/test_image_processing_depth_pro.py
+++ b/tests/models/depth_pro/test_image_processing_depth_pro.py
@@ -16,7 +16,7
@@ import unittest -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import is_flaky, require_torch, require_vision from transformers.utils import is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -116,3 +116,9 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + @is_flaky( + description="fast and slow, both processors use torch implementation, see: https://github.com/huggingface/transformers/issues/34920", + ) + def test_fast_is_faster_than_slow(self): + super().test_fast_is_faster_than_slow() From 4b762390086fde562f93ab04bf64b9f251e4c9de Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 7 Feb 2025 23:15:59 +0500 Subject: [PATCH 151/151] update checkpoint to "apple/DepthPro-hf" --- docs/source/en/model_doc/depth_pro.md | 10 +++++----- .../depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- tests/models/depth_pro/test_modeling_depth_pro.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 9ac15c6081d4..2447b7d93dd5 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -48,8 +48,8 @@ The DepthPro model processes an input image by first downsampling it at multiple >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) ->>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/depth-pro-hf") ->>> model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf").to(device) +>>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/DepthPro-hf") +>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf").to(device) >>> inputs = image_processor(images=image, return_tensors="pt").to(device) @@ -96,10 +96,10 @@ The network is supplemented with a focal length estimation head. A small convolu The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model. -The pretrained model at checkpoint `apple/depth-pro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. +The pretrained model at checkpoint `apple/DepthPro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. ```py >>> from transformers import DepthProForDepthEstimation ->>> model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf", use_fov_model=False) +>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf", use_fov_model=False) ``` To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config. 
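
A minimal sketch of that config-level toggle (illustrative only, not taken from the patch): building a randomly initialised model with the FOV head enabled only requires flipping the config flag.

```py
from transformers import DepthProConfig, DepthProForDepthEstimation

# Enable the field-of-view head at the config level; all other values keep their defaults.
config = DepthProConfig(use_fov_model=True)
model = DepthProForDepthEstimation(config)
print(model.config.use_fov_model)  # True
```
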
@@ -129,7 +129,7 @@ SDPA is used by default for `torch>=2.1.1` when an implementation is available, ```py from transformers import DepthProForDepthEstimation -model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) +model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) ``` For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index ec8732f80616..b24c6a5174f0 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -229,7 +229,7 @@ def main(): ) parser.add_argument( "--hub_repo_id", - default="apple/depth-pro-hf", + default="apple/DepthPro-hf", help="Huggingface hub repo to write the converted model and processor", ) args = parser.parse_args() diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3ba78dc2ad67..67715723d133 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -752,7 +752,7 @@ def forward( >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> checkpoint = "apple/depth-pro-hf" + >>> checkpoint = "apple/DepthPro-hf" >>> processor = AutoProcessor.from_pretrained(checkpoint) >>> model = DepthProModel.from_pretrained(checkpoint) @@ -1142,7 +1142,7 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> checkpoint = "apple/depth-pro-hf" + >>> checkpoint = "apple/DepthPro-hf" >>> processor = AutoImageProcessor.from_pretrained(checkpoint) >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 62b7465cac54..44529270fd94 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -329,7 +329,7 @@ def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): @slow def test_model_from_pretrained(self): - model_path = "apple/depth-pro-hf" + model_path = "apple/DepthPro-hf" model = DepthProModel.from_pretrained(model_path) self.assertIsNotNone(model) @@ -345,7 +345,7 @@ def prepare_img(): @slow class DepthProModelIntegrationTest(unittest.TestCase): def test_inference_depth_estimation(self): - model_path = "apple/depth-pro-hf" + model_path = "apple/DepthPro-hf" image_processor = DepthProImageProcessor.from_pretrained(model_path) model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device) config = model.config @@ -378,7 +378,7 @@ def test_inference_depth_estimation(self): torch.testing.assert_close(outputs.field_of_view, expected_slice, atol=1e-4, rtol=1e-4) def test_post_processing_depth_estimation(self): - model_path = "apple/depth-pro-hf" + model_path = "apple/DepthPro-hf" image_processor = DepthProImageProcessor.from_pretrained(model_path) model = DepthProForDepthEstimation.from_pretrained(model_path)
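
Taken together, the image-processor commits earlier in this series make `DepthProImageProcessorFast` mirror the slow processor: both rescale and normalize before resizing, and both resize with torch, which is also why `test_fast_is_faster_than_slow` ends up marked flaky rather than skipped. The sketch below (illustrative, not part of any patch) compares the `pixel_values` produced by the two processors, assuming the final checkpoint name `apple/DepthPro-hf`:

```py
import requests
import torch
from PIL import Image

from transformers import DepthProImageProcessor, DepthProImageProcessorFast

checkpoint = "apple/DepthPro-hf"
slow_processor = DepthProImageProcessor.from_pretrained(checkpoint)
fast_processor = DepthProImageProcessorFast.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Both processors rescale/normalize first and then resize with torch, so their
# outputs should agree closely and their runtimes are dominated by the same kernels.
slow_pixels = slow_processor(images=image, return_tensors="pt")["pixel_values"]
fast_pixels = fast_processor(images=image, return_tensors="pt")["pixel_values"]

max_diff = (fast_pixels - slow_pixels).abs().max().item()
print(f"max abs difference between fast and slow pixel_values: {max_diff:.2e}")
```
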